{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998401789995205, "eval_steps": 500, "global_step": 4692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1327.1429443359375, "completions/mean_terminated_length": 815.3893432617188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00021309466730595067, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11880554905295186, "kl": 0.0005807876586914062, "learning_rate": 0.0, "loss": 0.0525, "num_tokens": 659824.0, "reward": 0.590401828289032, "reward_std": 0.417092889547348, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1439732164144516, "rewards/tag_count_reward/std": 0.18524260818958282, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1581.8817138671875, "completions/mean_terminated_length": 1039.202880859375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.00042618933461190133, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.9835778773808289, "kl": 0.0007829666137695312, "learning_rate": 2.127659574468085e-09, "loss": 0.0479, "num_tokens": 1443227.0, "reward": 0.51953125, "reward_std": 0.3486446738243103, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1244419664144516, "rewards/tag_count_reward/std": 0.15507708489894867, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1447.96435546875, "completions/mean_terminated_length": 932.5809326171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.000639284001917852, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10499796417370877, "kl": 0.0006656646728515625, "learning_rate": 4.25531914893617e-09, "loss": 0.0887, "num_tokens": 2169131.0, "reward": 0.5764509439468384, "reward_std": 0.427979052066803, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.17117878794670105, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1508.450927734375, "completions/mean_terminated_length": 913.1737060546875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0008523786692238027, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1018359279083423, "kl": 0.0005950927734375, "learning_rate": 6.382978723404255e-09, "loss": 0.0928, "num_tokens": 2914437.0, "reward": 0.431919664144516, "reward_std": 0.36637213826179504, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45639166235923767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1372767835855484, "rewards/tag_count_reward/std": 0.18277305364608765, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1506.0626220703125, "completions/mean_terminated_length": 863.6682739257812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0010654733365297534, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.21057650648045662, "kl": 0.0006418228149414062, "learning_rate": 8.51063829787234e-09, "loss": 0.0709, "num_tokens": 3657233.0, "reward": 0.4151785969734192, "reward_std": 0.30718994140625, "rewards/accuracy_reward/mean": 0.2857142984867096, "rewards/accuracy_reward/std": 0.45225897431373596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1294642835855484, "rewards/tag_count_reward/std": 0.18765640258789062, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1621.5157470703125, "completions/mean_terminated_length": 846.3333129882812, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.001278568003835704, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09960264735253337, "kl": 0.0006780624389648438, "learning_rate": 1.0638297872340425e-08, "loss": 0.0792, "num_tokens": 4455016.0, "reward": 0.3738839328289032, "reward_std": 0.3686589002609253, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44215917587280273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1082589253783226, "rewards/tag_count_reward/std": 0.16469375789165497, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1571.7098388671875, "completions/mean_terminated_length": 835.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0014916626711416548, "frac_reward_zero_std": 0.0, "grad_norm": 0.10830495262519961, "kl": 0.0005960464477539062, "learning_rate": 1.276595744680851e-08, "loss": 0.06, "num_tokens": 5233366.0, "reward": 0.291294664144516, "reward_std": 0.35457053780555725, "rewards/accuracy_reward/mean": 0.1808035671710968, "rewards/accuracy_reward/std": 0.3852855861186981, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1104910746216774, "rewards/tag_count_reward/std": 0.1802973598241806, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1478.154052734375, "completions/mean_terminated_length": 997.4197387695312, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.0017047573384476053, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1165773576803505, "kl": 0.0007266998291015625, "learning_rate": 1.4893617021276595e-08, "loss": 0.0932, "num_tokens": 5962123.0, "reward": 0.5491071939468384, "reward_std": 0.38611385226249695, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.16899244487285614, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1435.9063720703125, "completions/mean_terminated_length": 959.8333740234375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.001917852005753556, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10603288197538065, "kl": 0.0006589889526367188, "learning_rate": 1.702127659574468e-08, "loss": 0.0621, "num_tokens": 6686481.0, "reward": 0.590401828289032, "reward_std": 0.3635442852973938, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1506696492433548, "rewards/tag_count_reward/std": 0.1734941452741623, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1382.2098388671875, "completions/mean_terminated_length": 854.904052734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.002130946673059507, "frac_reward_zero_std": 0.0, "grad_norm": 0.11471021646683922, "kl": 0.000652313232421875, "learning_rate": 1.9148936170212764e-08, "loss": 0.0799, "num_tokens": 7371935.0, "reward": 0.582589328289032, "reward_std": 0.40267178416252136, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1584821492433548, "rewards/tag_count_reward/std": 0.19785068929195404, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1536.4888916015625, "completions/mean_terminated_length": 940.9613647460938, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.0023440413403654574, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09045626739258034, "kl": 0.0005979537963867188, "learning_rate": 2.127659574468085e-08, "loss": 0.11, "num_tokens": 8126682.0, "reward": 0.4575892984867096, "reward_std": 0.35018956661224365, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1272321492433548, "rewards/tag_count_reward/std": 0.17537283897399902, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1438.700927734375, "completions/mean_terminated_length": 901.0841064453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.002557136007671408, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1092851498354237, "kl": 0.0005674362182617188, "learning_rate": 2.3404255319148933e-08, "loss": 0.0486, "num_tokens": 8838468.0, "reward": 0.6579241156578064, "reward_std": 0.3641918897628784, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1623883992433548, "rewards/tag_count_reward/std": 0.18918608129024506, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1497.3460693359375, "completions/mean_terminated_length": 895.2289428710938, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0027702306749773586, "frac_reward_zero_std": 0.0, "grad_norm": 0.11315634547553356, "kl": 0.0006742477416992188, "learning_rate": 2.553191489361702e-08, "loss": 0.0854, "num_tokens": 9586271.0, "reward": 0.490513414144516, "reward_std": 0.3397141993045807, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1266741007566452, "rewards/tag_count_reward/std": 0.16298216581344604, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1476.227783203125, "completions/mean_terminated_length": 924.517578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0029833253422833095, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09741794238486032, "kl": 0.00063323974609375, "learning_rate": 2.7659574468085105e-08, "loss": 0.0669, "num_tokens": 10313509.0, "reward": 0.5172991156578064, "reward_std": 0.4070625603199005, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1356026828289032, "rewards/tag_count_reward/std": 0.1774454265832901, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1312.8929443359375, "completions/mean_terminated_length": 859.0902709960938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00319642000958926, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11056495464072837, "kl": 0.0006198883056640625, "learning_rate": 2.978723404255319e-08, "loss": 0.0777, "num_tokens": 10973525.0, "reward": 0.5881696939468384, "reward_std": 0.41655388474464417, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1551339328289032, "rewards/tag_count_reward/std": 0.19554999470710754, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1393.80810546875, "completions/mean_terminated_length": 841.9176635742188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0034095146768952107, "frac_reward_zero_std": 0.0, "grad_norm": 0.10355788177332209, "kl": 0.0005626678466796875, "learning_rate": 3.191489361702127e-08, "loss": 0.056, "num_tokens": 11668783.0, "reward": 0.5334821939468384, "reward_std": 0.3836354911327362, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1540178507566452, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1613.8504638671875, "completions/mean_terminated_length": 776.7647094726562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0036226093442011612, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.09515067486346335, "kl": 0.0006103515625, "learning_rate": 3.404255319148936e-08, "loss": 0.0763, "num_tokens": 12459692.0, "reward": 0.3895089328289032, "reward_std": 0.32996246218681335, "rewards/accuracy_reward/mean": 0.2901785671710968, "rewards/accuracy_reward/std": 0.4543520212173462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0993303582072258, "rewards/tag_count_reward/std": 0.17105932533740997, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1384.009033203125, "completions/mean_terminated_length": 823.851806640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.003835704011507112, "frac_reward_zero_std": 0.0, "grad_norm": 0.10973826470058803, "kl": 0.00054168701171875, "learning_rate": 3.617021276595744e-08, "loss": 0.0804, "num_tokens": 13153088.0, "reward": 0.4603794813156128, "reward_std": 0.343288779258728, "rewards/accuracy_reward/mean": 0.3258928656578064, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1344866007566452, "rewards/tag_count_reward/std": 0.1686217337846756, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1576.8013916015625, "completions/mean_terminated_length": 1002.96533203125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.004048798678813062, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09850193451964769, "kl": 0.0006160736083984375, "learning_rate": 3.829787234042553e-08, "loss": 0.0535, "num_tokens": 13929223.0, "reward": 0.4715402126312256, "reward_std": 0.3523045480251312, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1277901828289032, "rewards/tag_count_reward/std": 0.1800851821899414, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1434.8929443359375, "completions/mean_terminated_length": 793.7899169921875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.004261893346119014, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10200327428815918, "kl": 0.0005636215209960938, "learning_rate": 4.042553191489362e-08, "loss": 0.0899, "num_tokens": 14655543.0, "reward": 0.4693080484867096, "reward_std": 0.36909398436546326, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399619221687317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1300223171710968, "rewards/tag_count_reward/std": 0.19278773665428162, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1556.779052734375, "completions/mean_terminated_length": 871.171142578125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.004474988013424964, "frac_reward_zero_std": 0.0, "grad_norm": 0.09078999703545892, "kl": 0.0005154609680175781, "learning_rate": 4.25531914893617e-08, "loss": 0.0339, "num_tokens": 15421684.0, "reward": 0.3738839328289032, "reward_std": 0.3145078122615814, "rewards/accuracy_reward/mean": 0.26157405972480774, "rewards/accuracy_reward/std": 0.4400014281272888, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1216517835855484, "rewards/tag_count_reward/std": 0.17773102223873138, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1498.372802734375, "completions/mean_terminated_length": 869.8516235351562, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.004688082680730915, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10555568921525868, "kl": 0.0006399154663085938, "learning_rate": 4.4680851063829786e-08, "loss": 0.108, "num_tokens": 16160331.0, "reward": 0.490513414144516, "reward_std": 0.36637213826179504, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1311383992433548, "rewards/tag_count_reward/std": 0.17686758935451508, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1437.4598388671875, "completions/mean_terminated_length": 940.62353515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0049011773480368654, "frac_reward_zero_std": 0.0, "grad_norm": 0.11060612805320082, "kl": 0.000606536865234375, "learning_rate": 4.6808510638297865e-08, "loss": 0.0842, "num_tokens": 16877817.0, "reward": 0.535714328289032, "reward_std": 0.39839938282966614, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1473214328289032, "rewards/tag_count_reward/std": 0.1922827959060669, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1459.2344970703125, "completions/mean_terminated_length": 930.3432006835938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.005114272015342816, "frac_reward_zero_std": 0.0, "grad_norm": 0.10436240015138523, "kl": 0.0006122589111328125, "learning_rate": 4.893617021276596e-08, "loss": 0.0704, "num_tokens": 17603042.0, "reward": 0.5412946939468384, "reward_std": 0.4264402985572815, "rewards/accuracy_reward/mean": 0.3958333432674408, "rewards/accuracy_reward/std": 0.4895959198474884, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1595982164144516, "rewards/tag_count_reward/std": 0.19408898055553436, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1578.3148193359375, "completions/mean_terminated_length": 817.4795532226562, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0053273666826487666, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09628061315971112, "kl": 0.0006122589111328125, "learning_rate": 5.106382978723404e-08, "loss": 0.0914, "num_tokens": 18378399.0, "reward": 0.4207589328289032, "reward_std": 0.2939138412475586, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45739173889160156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1238839253783226, "rewards/tag_count_reward/std": 0.18470267951488495, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1626.9888916015625, "completions/mean_terminated_length": 994.2960815429688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.005540461349954717, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09993865678887126, "kl": 0.0006809234619140625, "learning_rate": 5.3191489361702123e-08, "loss": 0.0513, "num_tokens": 19174554.0, "reward": 0.4988839626312256, "reward_std": 0.3830823004245758, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1194196417927742, "rewards/tag_count_reward/std": 0.16029927134513855, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1538.794677734375, "completions/mean_terminated_length": 884.10205078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.005753556017260668, "frac_reward_zero_std": 0.0, "grad_norm": 0.10245408847716521, "kl": 0.0005316734313964844, "learning_rate": 5.531914893617021e-08, "loss": 0.0669, "num_tokens": 19932558.0, "reward": 0.4469866156578064, "reward_std": 0.3912772238254547, "rewards/accuracy_reward/mean": 0.3169642984867096, "rewards/accuracy_reward/std": 0.4658135175704956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1300223171710968, "rewards/tag_count_reward/std": 0.18912668526172638, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1617.4754638671875, "completions/mean_terminated_length": 952.1193237304688, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.005966650684566619, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.08597152328544956, "kl": 0.0005950927734375, "learning_rate": 5.7446808510638295e-08, "loss": 0.0604, "num_tokens": 20728131.0, "reward": 0.4296875298023224, "reward_std": 0.3333337903022766, "rewards/accuracy_reward/mean": 0.3102678656578064, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1194196417927742, "rewards/tag_count_reward/std": 0.187626451253891, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1547.743408203125, "completions/mean_terminated_length": 874.623046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.00617974535187257, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3009094483530404, "kl": 0.0006208419799804688, "learning_rate": 5.957446808510638e-08, "loss": 0.082, "num_tokens": 21497520.0, "reward": 0.431919664144516, "reward_std": 0.34356507658958435, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46031373739242554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1283482164144516, "rewards/tag_count_reward/std": 0.1999439299106598, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1552.841552734375, "completions/mean_terminated_length": 976.3526611328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00639284001917852, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11999954723943836, "kl": 0.000629425048828125, "learning_rate": 6.170212765957446e-08, "loss": 0.0996, "num_tokens": 22260553.0, "reward": 0.5078125, "reward_std": 0.3757893443107605, "rewards/accuracy_reward/mean": 0.38657405972480774, "rewards/accuracy_reward/std": 0.4875292479991913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1350446492433548, "rewards/tag_count_reward/std": 0.1782640665769577, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1555.6273193359375, "completions/mean_terminated_length": 868.4118041992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.006605934686484471, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09875543353978837, "kl": 0.000629425048828125, "learning_rate": 6.382978723404254e-08, "loss": 0.1085, "num_tokens": 23034354.0, "reward": 0.420200914144516, "reward_std": 0.40927058458328247, "rewards/accuracy_reward/mean": 0.2991071343421936, "rewards/accuracy_reward/std": 0.45837873220443726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.12109375, "rewards/tag_count_reward/std": 0.1800643801689148, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1567.5826416015625, "completions/mean_terminated_length": 921.1571044921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.006819029353790421, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09796451753719258, "kl": 0.0006122589111328125, "learning_rate": 6.595744680851063e-08, "loss": 0.1041, "num_tokens": 23808775.0, "reward": 0.4626116156578064, "reward_std": 0.36975887417793274, "rewards/accuracy_reward/mean": 0.35648149251937866, "rewards/accuracy_reward/std": 0.47951504588127136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1188616082072258, "rewards/tag_count_reward/std": 0.17205896973609924, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1345.7857666015625, "completions/mean_terminated_length": 842.6666259765625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.007032124021096372, "frac_reward_zero_std": 0.0, "grad_norm": 0.12196101026133285, "kl": 0.000682830810546875, "learning_rate": 6.808510638297873e-08, "loss": 0.0889, "num_tokens": 24475687.0, "reward": 0.6467634439468384, "reward_std": 0.4578225910663605, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16015625, "rewards/tag_count_reward/std": 0.1836206167936325, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1346.9866943359375, "completions/mean_terminated_length": 658.3805541992188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0072452186884023224, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10745798801090463, "kl": 0.0006198883056640625, "learning_rate": 7.02127659574468e-08, "loss": 0.0601, "num_tokens": 25150641.0, "reward": 0.5072544813156128, "reward_std": 0.3134019076824188, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1233258917927742, "rewards/tag_count_reward/std": 0.16721661388874054, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1566.732177734375, "completions/mean_terminated_length": 888.8171997070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.007458313355708273, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12964865572439357, "kl": 0.0006628036499023438, "learning_rate": 7.234042553191488e-08, "loss": 0.0827, "num_tokens": 25920777.0, "reward": 0.3839285969734192, "reward_std": 0.3860481381416321, "rewards/accuracy_reward/mean": 0.2611607015132904, "rewards/accuracy_reward/std": 0.43975841999053955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1227678582072258, "rewards/tag_count_reward/std": 0.1800929754972458, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1405.2188720703125, "completions/mean_terminated_length": 891.510009765625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.007671408023014224, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1151012199106129, "kl": 0.0005998611450195312, "learning_rate": 7.446808510638298e-08, "loss": 0.0638, "num_tokens": 26628475.0, "reward": 0.6545759439468384, "reward_std": 0.3889673948287964, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1545758992433548, "rewards/tag_count_reward/std": 0.1728697121143341, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1378.7232666015625, "completions/mean_terminated_length": 961.6376953125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.007884502690320174, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10097242288441606, "kl": 0.0005893707275390625, "learning_rate": 7.659574468085106e-08, "loss": 0.0848, "num_tokens": 27312959.0, "reward": 0.6194196939468384, "reward_std": 0.3824792504310608, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1573660671710968, "rewards/tag_count_reward/std": 0.18260905146598816, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1446.9710693359375, "completions/mean_terminated_length": 856.5796508789062, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.008097597357626125, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.30447813766076604, "kl": 0.000576019287109375, "learning_rate": 7.872340425531915e-08, "loss": 0.0205, "num_tokens": 28031314.0, "reward": 0.5111607313156128, "reward_std": 0.36669179797172546, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1473214328289032, "rewards/tag_count_reward/std": 0.19082291424274445, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1713.3751220703125, "completions/mean_terminated_length": 1116.86962890625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.008310692024932075, "frac_reward_zero_std": 0.0, "grad_norm": 0.10262973183741655, "kl": 0.0006504058837890625, "learning_rate": 8.085106382978724e-08, "loss": 0.0872, "num_tokens": 28871850.0, "reward": 0.4151785969734192, "reward_std": 0.38343989849090576, "rewards/accuracy_reward/mean": 0.2879464328289032, "rewards/accuracy_reward/std": 0.4533122181892395, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1272321492433548, "rewards/tag_count_reward/std": 0.17215418815612793, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1496.3929443359375, "completions/mean_terminated_length": 934.8468627929688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.008523786692238028, "frac_reward_zero_std": 0.0, "grad_norm": 0.8394464547555742, "kl": 0.0006561279296875, "learning_rate": 8.297872340425531e-08, "loss": 0.0913, "num_tokens": 29614746.0, "reward": 0.5111607313156128, "reward_std": 0.4580632746219635, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1294642835855484, "rewards/tag_count_reward/std": 0.17612579464912415, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1538.446533203125, "completions/mean_terminated_length": 807.3478393554688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.008736881359543978, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.08706024487907357, "kl": 0.0006227493286132812, "learning_rate": 8.51063829787234e-08, "loss": 0.0651, "num_tokens": 30370306.0, "reward": 0.4648437798023224, "reward_std": 0.24940386414527893, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399619221687317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1255580335855484, "rewards/tag_count_reward/std": 0.16554337739944458, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1661.513427734375, "completions/mean_terminated_length": 945.1592407226562, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.008949976026849929, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.09806317826615497, "kl": 0.0006170272827148438, "learning_rate": 8.723404255319149e-08, "loss": 0.0629, "num_tokens": 31193624.0, "reward": 0.4073660969734192, "reward_std": 0.2731107175350189, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.46403056383132935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0948660746216774, "rewards/tag_count_reward/std": 0.14553913474082947, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1494.930908203125, "completions/mean_terminated_length": 956.4801635742188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00916307069415588, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.0128460757898141, "kl": 0.000698089599609375, "learning_rate": 8.936170212765957e-08, "loss": 0.0503, "num_tokens": 31932217.0, "reward": 0.5189732313156128, "reward_std": 0.35851016640663147, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1462053507566452, "rewards/tag_count_reward/std": 0.17885133624076843, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1457.99560546875, "completions/mean_terminated_length": 867.9910888671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.00937616536146183, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11776272944133993, "kl": 0.0006351470947265625, "learning_rate": 9.148936170212765e-08, "loss": 0.1127, "num_tokens": 32652839.0, "reward": 0.5340402126312256, "reward_std": 0.43986910581588745, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1545758992433548, "rewards/tag_count_reward/std": 0.18686223030090332, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1485.26123046875, "completions/mean_terminated_length": 912.3829345703125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.00958926002876778, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1035863264780186, "kl": 0.00067138671875, "learning_rate": 9.361702127659573e-08, "loss": 0.0447, "num_tokens": 33387820.0, "reward": 0.4587053656578064, "reward_std": 0.36645394563674927, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1216517835855484, "rewards/tag_count_reward/std": 0.16802562773227692, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1382.1876220703125, "completions/mean_terminated_length": 887.3618774414062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.009802354696073731, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11381146694180108, "kl": 0.0007276535034179688, "learning_rate": 9.574468085106382e-08, "loss": 0.0277, "num_tokens": 34072976.0, "reward": 0.5853794813156128, "reward_std": 0.41184261441230774, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1568080335855484, "rewards/tag_count_reward/std": 0.18423150479793549, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1514.1473388671875, "completions/mean_terminated_length": 881.3366088867188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.010015449363379681, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09565738552322595, "kl": 0.0006103515625, "learning_rate": 9.787234042553192e-08, "loss": 0.1223, "num_tokens": 34824370.0, "reward": 0.4804687798023224, "reward_std": 0.360610693693161, "rewards/accuracy_reward/mean": 0.35185185074806213, "rewards/accuracy_reward/std": 0.4781017303466797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1411830335855484, "rewards/tag_count_reward/std": 0.19434207677841187, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1428.7054443359375, "completions/mean_terminated_length": 792.5972900390625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.010228544030685632, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.2438327554885991, "kl": 0.0006103515625, "learning_rate": 1e-07, "loss": 0.0925, "num_tokens": 35537726.0, "reward": 0.4614955484867096, "reward_std": 0.33448395133018494, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1333705335855484, "rewards/tag_count_reward/std": 0.18375654518604279, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1458.5804443359375, "completions/mean_terminated_length": 879.5928955078125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.010441638697991583, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.0977685531678709, "kl": 0.0006132125854492188, "learning_rate": 1.0212765957446807e-07, "loss": 0.0798, "num_tokens": 36260242.0, "reward": 0.5602678656578064, "reward_std": 0.3839297592639923, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1383928507566452, "rewards/tag_count_reward/std": 0.17164580523967743, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1551.4442138671875, "completions/mean_terminated_length": 978.4952392578125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.010654733365297533, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.4952207656777053, "kl": 0.0005893707275390625, "learning_rate": 1.0425531914893617e-07, "loss": 0.0671, "num_tokens": 37026313.0, "reward": 0.4910714626312256, "reward_std": 0.3419077694416046, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1272321492433548, "rewards/tag_count_reward/std": 0.16637177765369415, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1255.325927734375, "completions/mean_terminated_length": 779.721435546875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.010867828032603484, "frac_reward_zero_std": 0.0, "grad_norm": 0.1412960599140765, "kl": 0.0006103515625, "learning_rate": 1.0638297872340425e-07, "loss": 0.0137, "num_tokens": 37653003.0, "reward": 0.5574777126312256, "reward_std": 0.40723317861557007, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1646205335855484, "rewards/tag_count_reward/std": 0.1945667564868927, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1410.7991943359375, "completions/mean_terminated_length": 843.5020751953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.011080922699909434, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11296164846364418, "kl": 0.0006666183471679688, "learning_rate": 1.0851063829787234e-07, "loss": 0.0584, "num_tokens": 38347265.0, "reward": 0.5652902126312256, "reward_std": 0.3664935827255249, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.13671875, "rewards/tag_count_reward/std": 0.1684810072183609, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1343.493408203125, "completions/mean_terminated_length": 883.354248046875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.011294017367215385, "frac_reward_zero_std": 0.0, "grad_norm": 0.1357720870845481, "kl": 0.0006160736083984375, "learning_rate": 1.1063829787234042e-07, "loss": 0.1081, "num_tokens": 39007150.0, "reward": 0.6556919813156128, "reward_std": 0.40325039625167847, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1668526828289032, "rewards/tag_count_reward/std": 0.17356158792972565, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1412.3817138671875, "completions/mean_terminated_length": 836.26806640625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.011507112034521335, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10810936118624642, "kl": 0.00066375732421875, "learning_rate": 1.127659574468085e-07, "loss": 0.0865, "num_tokens": 39704041.0, "reward": 0.52734375, "reward_std": 0.32285118103027344, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.17759311199188232, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1373.024658203125, "completions/mean_terminated_length": 889.4214477539062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.011720206701827288, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12102843929228482, "kl": 0.0006570816040039062, "learning_rate": 1.1489361702127659e-07, "loss": 0.0903, "num_tokens": 40391124.0, "reward": 0.6618303656578064, "reward_std": 0.38166266679763794, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1529017835855484, "rewards/tag_count_reward/std": 0.16740036010742188, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1613.6629638671875, "completions/mean_terminated_length": 1001.8548583984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.011933301369133238, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15827625867598152, "kl": 0.00077056884765625, "learning_rate": 1.1702127659574468e-07, "loss": 0.0564, "num_tokens": 41188589.0, "reward": 0.4810267984867096, "reward_std": 0.4002097249031067, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1149553582072258, "rewards/tag_count_reward/std": 0.15654832124710083, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1444.0535888671875, "completions/mean_terminated_length": 891.7265625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.012146396036439189, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.34802836790378183, "kl": 0.000701904296875, "learning_rate": 1.1914893617021276e-07, "loss": 0.0279, "num_tokens": 41907045.0, "reward": 0.5736607313156128, "reward_std": 0.3971804976463318, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.15963202714920044, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1524.138427734375, "completions/mean_terminated_length": 868.6532592773438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.01235949070374514, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1009940678802228, "kl": 0.00066375732421875, "learning_rate": 1.2127659574468084e-07, "loss": 0.078, "num_tokens": 42659059.0, "reward": 0.3900669813156128, "reward_std": 0.3522178530693054, "rewards/accuracy_reward/mean": 0.2566964328289032, "rewards/accuracy_reward/std": 0.4372987151145935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1333705335855484, "rewards/tag_count_reward/std": 0.18451586365699768, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1516.0491943359375, "completions/mean_terminated_length": 832.1122436523438, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.01257258537105109, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10333814503815872, "kl": 0.0006532669067382812, "learning_rate": 1.2340425531914892e-07, "loss": 0.0765, "num_tokens": 43409465.0, "reward": 0.3671875298023224, "reward_std": 0.2861935496330261, "rewards/accuracy_reward/mean": 0.2455357164144516, "rewards/accuracy_reward/std": 0.4308854937553406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1216517835855484, "rewards/tag_count_reward/std": 0.1792975217103958, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1369.654052734375, "completions/mean_terminated_length": 827.5220336914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.01278568003835704, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10919075700279716, "kl": 0.0006704330444335938, "learning_rate": 1.25531914893617e-07, "loss": 0.0528, "num_tokens": 44090958.0, "reward": 0.5178571939468384, "reward_std": 0.39728257060050964, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1339285671710968, "rewards/tag_count_reward/std": 0.17355531454086304, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1584.8035888671875, "completions/mean_terminated_length": 901.52490234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.012998774705662991, "frac_reward_zero_std": 0.0, "grad_norm": 0.10452449253069136, "kl": 0.0005750656127929688, "learning_rate": 1.2765957446808508e-07, "loss": 0.0925, "num_tokens": 44871766.0, "reward": 0.3822544813156128, "reward_std": 0.3722384572029114, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.43349677324295044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1322544664144516, "rewards/tag_count_reward/std": 0.19702155888080597, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1499.5335693359375, "completions/mean_terminated_length": 807.0252685546875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.013211869372968942, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10269737040638459, "kl": 0.0006465911865234375, "learning_rate": 1.2978723404255319e-07, "loss": 0.0809, "num_tokens": 45616389.0, "reward": 0.4665178656578064, "reward_std": 0.3160642385482788, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1138392835855484, "rewards/tag_count_reward/std": 0.16601121425628662, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1543.1629638671875, "completions/mean_terminated_length": 986.18310546875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.013424964040274892, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14117659955961492, "kl": 0.0006532669067382812, "learning_rate": 1.3191489361702127e-07, "loss": 0.0802, "num_tokens": 46379694.0, "reward": 0.4854910969734192, "reward_std": 0.35438334941864014, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1283482164144516, "rewards/tag_count_reward/std": 0.15772415697574615, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1556.7210693359375, "completions/mean_terminated_length": 942.0050048828125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.013638058707580843, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10516515272337294, "kl": 0.0006771087646484375, "learning_rate": 1.3404255319148934e-07, "loss": 0.0218, "num_tokens": 47145953.0, "reward": 0.4017857313156128, "reward_std": 0.3413859009742737, "rewards/accuracy_reward/mean": 0.2790178656578064, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1227678582072258, "rewards/tag_count_reward/std": 0.16721008718013763, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1427.5001220703125, "completions/mean_terminated_length": 894.5394897460938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.013851153374886793, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14553769231051605, "kl": 0.0005903244018554688, "learning_rate": 1.3617021276595745e-07, "loss": 0.0675, "num_tokens": 47849697.0, "reward": 0.5027902126312256, "reward_std": 0.3909113109111786, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.4822677969932556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.13671875, "rewards/tag_count_reward/std": 0.19394339621067047, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1459.2344970703125, "completions/mean_terminated_length": 966.9876708984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.014064248042192744, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10345709313267537, "kl": 0.0006418228149414062, "learning_rate": 1.3829787234042553e-07, "loss": 0.0603, "num_tokens": 48568570.0, "reward": 0.5334821939468384, "reward_std": 0.3349814713001251, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1428571492433548, "rewards/tag_count_reward/std": 0.16372442245483398, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1437.435302734375, "completions/mean_terminated_length": 922.3497924804688, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.014277342709498694, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1616650053839849, "kl": 0.0005855560302734375, "learning_rate": 1.404255319148936e-07, "loss": 0.0946, "num_tokens": 49271421.0, "reward": 0.5719866156578064, "reward_std": 0.34255507588386536, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1568080335855484, "rewards/tag_count_reward/std": 0.19671083986759186, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1536.024658203125, "completions/mean_terminated_length": 895.4120483398438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.014490437376804645, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.23135387511604344, "kl": 0.0006227493286132812, "learning_rate": 1.4255319148936172e-07, "loss": 0.0724, "num_tokens": 50038696.0, "reward": 0.4469866156578064, "reward_std": 0.3783811330795288, "rewards/accuracy_reward/mean": 0.3258928656578064, "rewards/accuracy_reward/std": 0.46923142671585083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.12109375, "rewards/tag_count_reward/std": 0.17293468117713928, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1371.52685546875, "completions/mean_terminated_length": 891.282470703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.014703532044110595, "frac_reward_zero_std": 0.0, "grad_norm": 0.10727099539961585, "kl": 0.00057220458984375, "learning_rate": 1.4468085106382977e-07, "loss": 0.0731, "num_tokens": 50723620.0, "reward": 0.5245535969734192, "reward_std": 0.3711250424385071, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1629464328289032, "rewards/tag_count_reward/std": 0.19346082210540771, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1441.6451416015625, "completions/mean_terminated_length": 901.81005859375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.014916626711416546, "frac_reward_zero_std": 0.0, "grad_norm": 0.10465712838302302, "kl": 0.0006055831909179688, "learning_rate": 1.4680851063829785e-07, "loss": 0.0462, "num_tokens": 51430517.0, "reward": 0.5831473469734192, "reward_std": 0.310307115316391, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1411830335855484, "rewards/tag_count_reward/std": 0.157814159989357, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1391.1094970703125, "completions/mean_terminated_length": 831.93798828125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.015129721378722498, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1106227567249354, "kl": 0.0005865097045898438, "learning_rate": 1.4893617021276595e-07, "loss": 0.0918, "num_tokens": 52123830.0, "reward": 0.5452009439468384, "reward_std": 0.3809743821620941, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1501116007566452, "rewards/tag_count_reward/std": 0.182987242937088, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1392.232177734375, "completions/mean_terminated_length": 991.2230224609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.015342816046028449, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10862713855052201, "kl": 0.0006055831909179688, "learning_rate": 1.5106382978723403e-07, "loss": 0.033, "num_tokens": 52812910.0, "reward": 0.6010044813156128, "reward_std": 0.35854873061180115, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1568080335855484, "rewards/tag_count_reward/std": 0.1700226068496704, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1403.3326416015625, "completions/mean_terminated_length": 937.1884765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0155559107133344, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11132467714117722, "kl": 0.0006723403930664062, "learning_rate": 1.531914893617021e-07, "loss": 0.0977, "num_tokens": 53513347.0, "reward": 0.641183078289032, "reward_std": 0.371239572763443, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1657366007566452, "rewards/tag_count_reward/std": 0.18092206120491028, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1589.3192138671875, "completions/mean_terminated_length": 912.7017211914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.015769005380640348, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09876810544299988, "kl": 0.0006694793701171875, "learning_rate": 1.5531914893617022e-07, "loss": 0.0526, "num_tokens": 54292754.0, "reward": 0.459263414144516, "reward_std": 0.4004373848438263, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1244419664144516, "rewards/tag_count_reward/std": 0.18242010474205017, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1407.1920166015625, "completions/mean_terminated_length": 930.9494018554688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0159821000479463, "frac_reward_zero_std": 0.0, "grad_norm": 0.13130369092024977, "kl": 0.0006513595581054688, "learning_rate": 1.574468085106383e-07, "loss": 0.0654, "num_tokens": 54988840.0, "reward": 0.5859375, "reward_std": 0.33688119053840637, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1529017835855484, "rewards/tag_count_reward/std": 0.17555426061153412, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1507.524658203125, "completions/mean_terminated_length": 855.2265625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.01619519471525225, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10490041011979272, "kl": 0.0006437301635742188, "learning_rate": 1.5957446808510638e-07, "loss": 0.0503, "num_tokens": 55730531.0, "reward": 0.504464328289032, "reward_std": 0.33776572346687317, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1495535671710968, "rewards/tag_count_reward/std": 0.21339333057403564, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1523.560302734375, "completions/mean_terminated_length": 923.842041015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0164082893825582, "frac_reward_zero_std": 0.0, "grad_norm": 0.10986942428119216, "kl": 0.0005559921264648438, "learning_rate": 1.6170212765957448e-07, "loss": 0.0985, "num_tokens": 56480542.0, "reward": 0.5463169813156128, "reward_std": 0.3517903685569763, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1378348171710968, "rewards/tag_count_reward/std": 0.18652118742465973, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1352.712158203125, "completions/mean_terminated_length": 970.1834106445312, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.01662138404986415, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11358275964744895, "kl": 0.0005674362182617188, "learning_rate": 1.6382978723404254e-07, "loss": 0.1028, "num_tokens": 57151981.0, "reward": 0.5931919813156128, "reward_std": 0.3596561551094055, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1690848171710968, "rewards/tag_count_reward/std": 0.17935582995414734, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1519.7723388671875, "completions/mean_terminated_length": 904.7825927734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.016834478717170104, "frac_reward_zero_std": 0.0, "grad_norm": 0.10108913185978581, "kl": 0.0005426406860351562, "learning_rate": 1.6595744680851062e-07, "loss": 0.1016, "num_tokens": 57900567.0, "reward": 0.5290178656578064, "reward_std": 0.3699549734592438, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1428571492433548, "rewards/tag_count_reward/std": 0.17686142027378082, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1466.3035888671875, "completions/mean_terminated_length": 943.7626953125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.017047573384476055, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.27266530804882455, "kl": 0.000659942626953125, "learning_rate": 1.6808510638297872e-07, "loss": 0.0881, "num_tokens": 58624063.0, "reward": 0.4508928656578064, "reward_std": 0.3456571400165558, "rewards/accuracy_reward/mean": 0.3102678656578064, "rewards/accuracy_reward/std": 0.46312034130096436, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.18555572628974915, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1604.509033203125, "completions/mean_terminated_length": 1029.107666015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.017260668051782006, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.27194617401617815, "kl": 0.0006427764892578125, "learning_rate": 1.702127659574468e-07, "loss": 0.0498, "num_tokens": 59416195.0, "reward": 0.5368303656578064, "reward_std": 0.37965935468673706, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1238839253783226, "rewards/tag_count_reward/std": 0.16039270162582397, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1410.837158203125, "completions/mean_terminated_length": 915.2659301757812, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.017473762719087956, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12917745958808036, "kl": 0.00067138671875, "learning_rate": 1.7234042553191488e-07, "loss": 0.0966, "num_tokens": 60120074.0, "reward": 0.621651828289032, "reward_std": 0.3663535714149475, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1551339328289032, "rewards/tag_count_reward/std": 0.17598041892051697, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1607.3795166015625, "completions/mean_terminated_length": 1025.21240234375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.017686857386393907, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12240175494287042, "kl": 0.0006380081176757812, "learning_rate": 1.7446808510638299e-07, "loss": 0.0692, "num_tokens": 60908196.0, "reward": 0.4079241156578064, "reward_std": 0.29820477962493896, "rewards/accuracy_reward/mean": 0.2879464328289032, "rewards/accuracy_reward/std": 0.4533121883869171, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1199776753783226, "rewards/tag_count_reward/std": 0.15767961740493774, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1484.4844970703125, "completions/mean_terminated_length": 879.2268676757812, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.017899952053699857, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09501206879675293, "kl": 0.0005626678466796875, "learning_rate": 1.7659574468085106e-07, "loss": 0.1089, "num_tokens": 61636637.0, "reward": 0.459263414144516, "reward_std": 0.40043362975120544, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1199776753783226, "rewards/tag_count_reward/std": 0.18311679363250732, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1551.47998046875, "completions/mean_terminated_length": 871.0634765625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.018113046721005808, "frac_reward_zero_std": 0.0, "grad_norm": 0.1024884959406532, "kl": 0.0006132125854492188, "learning_rate": 1.7872340425531914e-07, "loss": 0.1096, "num_tokens": 62402452.0, "reward": 0.4324777126312256, "reward_std": 0.3555998206138611, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46031373739242554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.12890625, "rewards/tag_count_reward/std": 0.16966979205608368, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1443.3817138671875, "completions/mean_terminated_length": 919.3792114257812, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.01832614138831176, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10653421298342539, "kl": 0.0006618499755859375, "learning_rate": 1.8085106382978725e-07, "loss": 0.0725, "num_tokens": 63116687.0, "reward": 0.6194196939468384, "reward_std": 0.3753754496574402, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1506696492433548, "rewards/tag_count_reward/std": 0.1718747764825821, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1498.6785888671875, "completions/mean_terminated_length": 959.0796508789062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.01853923605561771, "frac_reward_zero_std": 0.0, "grad_norm": 0.0995652245577469, "kl": 0.0005559921264648438, "learning_rate": 1.829787234042553e-07, "loss": 0.1143, "num_tokens": 63859839.0, "reward": 0.4760044813156128, "reward_std": 0.4069821238517761, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1456473171710968, "rewards/tag_count_reward/std": 0.17969655990600586, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1625.6295166015625, "completions/mean_terminated_length": 1101.8900146484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.01875233072292366, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09725143224765981, "kl": 0.0006198883056640625, "learning_rate": 1.8510638297872338e-07, "loss": 0.0878, "num_tokens": 64654553.0, "reward": 0.5206473469734192, "reward_std": 0.332084059715271, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.17117878794670105, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1465.9754638671875, "completions/mean_terminated_length": 904.372802734375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.01896542539022961, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.09620892501706234, "kl": 0.0006999969482421875, "learning_rate": 1.8723404255319146e-07, "loss": 0.056, "num_tokens": 65378878.0, "reward": 0.5089285969734192, "reward_std": 0.3440691828727722, "rewards/accuracy_reward/mean": 0.38657405972480774, "rewards/accuracy_reward/std": 0.4875292479991913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1361607164144516, "rewards/tag_count_reward/std": 0.16431809961795807, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1402.18310546875, "completions/mean_terminated_length": 857.3579711914062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.01917852005753556, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10804353703288051, "kl": 0.0005998611450195312, "learning_rate": 1.8936170212765957e-07, "loss": 0.1313, "num_tokens": 66075712.0, "reward": 0.59765625, "reward_std": 0.33677539229393005, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1534598171710968, "rewards/tag_count_reward/std": 0.1722475290298462, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1543.1629638671875, "completions/mean_terminated_length": 917.1649780273438, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.01939161472484151, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09515397637834716, "kl": 0.000644683837890625, "learning_rate": 1.9148936170212765e-07, "loss": 0.0675, "num_tokens": 66839065.0, "reward": 0.420200914144516, "reward_std": 0.3352234661579132, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45011183619499207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1389508992433548, "rewards/tag_count_reward/std": 0.19234690070152283, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1482.5848388671875, "completions/mean_terminated_length": 917.169677734375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.019604709392147462, "frac_reward_zero_std": 0.0, "grad_norm": 0.1068141667938709, "kl": 0.0005855560302734375, "learning_rate": 1.9361702127659573e-07, "loss": 0.0812, "num_tokens": 67571631.0, "reward": 0.5496652126312256, "reward_std": 0.3237675428390503, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1456473171710968, "rewards/tag_count_reward/std": 0.1789167821407318, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1530.540283203125, "completions/mean_terminated_length": 984.5963134765625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.019817804059453412, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09780008866627878, "kl": 0.0006361007690429688, "learning_rate": 1.9574468085106383e-07, "loss": 0.0536, "num_tokens": 68336577.0, "reward": 0.4927455484867096, "reward_std": 0.34423699975013733, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.12890625, "rewards/tag_count_reward/std": 0.16034892201423645, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1468.466552734375, "completions/mean_terminated_length": 961.6777954101562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.020030898726759363, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10978206593533296, "kl": 0.0005826950073242188, "learning_rate": 1.978723404255319e-07, "loss": 0.0805, "num_tokens": 69068178.0, "reward": 0.555245578289032, "reward_std": 0.3736456334590912, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1400669664144516, "rewards/tag_count_reward/std": 0.16821405291557312, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1644.435302734375, "completions/mean_terminated_length": 1008.936767578125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.020243993394065313, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1545924523125092, "kl": 0.0007457733154296875, "learning_rate": 2e-07, "loss": 0.0493, "num_tokens": 69878165.0, "reward": 0.3917410969734192, "reward_std": 0.340265154838562, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44215917587280273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1261160671710968, "rewards/tag_count_reward/std": 0.18164943158626556, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1456.8907470703125, "completions/mean_terminated_length": 949.17431640625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.020457088061371264, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.30280277963362745, "kl": 0.0006666183471679688, "learning_rate": 2.0212765957446807e-07, "loss": 0.1121, "num_tokens": 70603604.0, "reward": 0.5251116156578064, "reward_std": 0.38898965716362, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1501116007566452, "rewards/tag_count_reward/std": 0.17834369838237762, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1396.743408203125, "completions/mean_terminated_length": 975.3419189453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.020670182728677215, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09653671528525909, "kl": 0.0006580352783203125, "learning_rate": 2.0425531914893615e-07, "loss": 0.0432, "num_tokens": 71300305.0, "reward": 0.6110491156578064, "reward_std": 0.3495582640171051, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1467633992433548, "rewards/tag_count_reward/std": 0.16747771203517914, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1491.6116943359375, "completions/mean_terminated_length": 940.1688842773438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.020883277395983165, "frac_reward_zero_std": 0.0, "grad_norm": 0.8778560965538851, "kl": 0.0006694793701171875, "learning_rate": 2.0638297872340423e-07, "loss": 0.074, "num_tokens": 72035571.0, "reward": 0.5223214626312256, "reward_std": 0.3277195692062378, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1607142835855484, "rewards/tag_count_reward/std": 0.19532322883605957, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1413.325927734375, "completions/mean_terminated_length": 761.4208374023438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.021096372063289116, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11767497920125451, "kl": 0.000736236572265625, "learning_rate": 2.0851063829787233e-07, "loss": 0.0494, "num_tokens": 72731221.0, "reward": 0.5178571939468384, "reward_std": 0.34111085534095764, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1339285671710968, "rewards/tag_count_reward/std": 0.1719365119934082, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1466.2098388671875, "completions/mean_terminated_length": 884.419677734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.021309466730595066, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1027935139762827, "kl": 0.0005636215209960938, "learning_rate": 2.1063829787234041e-07, "loss": 0.0905, "num_tokens": 73457123.0, "reward": 0.4642857313156128, "reward_std": 0.3221930265426636, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839529275894165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.1754866987466812, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1335.977783203125, "completions/mean_terminated_length": 951.8281860351562, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.021522561397901017, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.29290425659683816, "kl": 0.000652313232421875, "learning_rate": 2.127659574468085e-07, "loss": 0.0692, "num_tokens": 74112921.0, "reward": 0.6774553656578064, "reward_std": 0.36400285363197327, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1618303507566452, "rewards/tag_count_reward/std": 0.1681741625070572, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1609.1407470703125, "completions/mean_terminated_length": 911.5317993164062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.021735656065206967, "frac_reward_zero_std": 0.0, "grad_norm": 0.10055986763775289, "kl": 0.0006399154663085938, "learning_rate": 2.148936170212766e-07, "loss": 0.0764, "num_tokens": 74907400.0, "reward": 0.3805803656578064, "reward_std": 0.29630815982818604, "rewards/accuracy_reward/mean": 0.2566964328289032, "rewards/accuracy_reward/std": 0.4372987747192383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1238839253783226, "rewards/tag_count_reward/std": 0.169711172580719, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1511.0379638671875, "completions/mean_terminated_length": 874.5414428710938, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.021948750732512918, "frac_reward_zero_std": 0.0, "grad_norm": 0.2765706723648975, "kl": 0.000591278076171875, "learning_rate": 2.1702127659574468e-07, "loss": 0.1189, "num_tokens": 75653033.0, "reward": 0.4765625298023224, "reward_std": 0.370633989572525, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47086748480796814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1462053507566452, "rewards/tag_count_reward/std": 0.19240935146808624, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1562.7523193359375, "completions/mean_terminated_length": 1050.7935791015625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.02216184539981887, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.22219206919252477, "kl": 0.0006437301635742188, "learning_rate": 2.1914893617021276e-07, "loss": 0.0435, "num_tokens": 76422474.0, "reward": 0.5775669813156128, "reward_std": 0.4197610020637512, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1378348171710968, "rewards/tag_count_reward/std": 0.16248351335525513, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1497.3126220703125, "completions/mean_terminated_length": 832.6896362304688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.02237494006712482, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.08405243328749457, "kl": 0.00061798095703125, "learning_rate": 2.2127659574468084e-07, "loss": 0.0677, "num_tokens": 77162566.0, "reward": 0.4793527126312256, "reward_std": 0.3031631410121918, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1110491082072258, "rewards/tag_count_reward/std": 0.1748300939798355, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1625.0157470703125, "completions/mean_terminated_length": 1119.0931396484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02258803473443077, "frac_reward_zero_std": 0.0, "grad_norm": 0.09763429596861473, "kl": 0.0006494522094726562, "learning_rate": 2.2340425531914892e-07, "loss": 0.0522, "num_tokens": 77973837.0, "reward": 0.504464328289032, "reward_std": 0.40397918224334717, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1294642835855484, "rewards/tag_count_reward/std": 0.15136271715164185, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1463.2723388671875, "completions/mean_terminated_length": 938.0084838867188, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.02280112940173672, "frac_reward_zero_std": 0.0, "grad_norm": 0.10807042921158809, "kl": 0.0006504058837890625, "learning_rate": 2.25531914893617e-07, "loss": 0.0468, "num_tokens": 78699927.0, "reward": 0.5256696939468384, "reward_std": 0.394024521112442, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1484375, "rewards/tag_count_reward/std": 0.18397125601768494, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1410.4398193359375, "completions/mean_terminated_length": 872.5802001953125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02301422406904267, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11091067162003142, "kl": 0.0005054473876953125, "learning_rate": 2.276595744680851e-07, "loss": 0.0834, "num_tokens": 79398924.0, "reward": 0.570870578289032, "reward_std": 0.35215818881988525, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1623883992433548, "rewards/tag_count_reward/std": 0.17853958904743195, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1504.743408203125, "completions/mean_terminated_length": 994.4112548828125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.02322731873634862, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09505077221753427, "kl": 0.0005846023559570312, "learning_rate": 2.2978723404255318e-07, "loss": 0.06, "num_tokens": 80137577.0, "reward": 0.5831473469734192, "reward_std": 0.39024558663368225, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.1768040508031845, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1589.3126220703125, "completions/mean_terminated_length": 1083.2489013671875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.023440413403654575, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09416312435280728, "kl": 0.0006666183471679688, "learning_rate": 2.3191489361702126e-07, "loss": 0.068, "num_tokens": 80924645.0, "reward": 0.5418527126312256, "reward_std": 0.37803032994270325, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1489955335855484, "rewards/tag_count_reward/std": 0.18237218260765076, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1569.274658203125, "completions/mean_terminated_length": 793.7953491210938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.023653508070960526, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09445706322437208, "kl": 0.0006265640258789062, "learning_rate": 2.3404255319148937e-07, "loss": 0.0926, "num_tokens": 81707072.0, "reward": 0.291294664144516, "reward_std": 0.33396586775779724, "rewards/accuracy_reward/mean": 0.1852678507566452, "rewards/accuracy_reward/std": 0.38894903659820557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1060267835855484, "rewards/tag_count_reward/std": 0.16947561502456665, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1318.1629638671875, "completions/mean_terminated_length": 713.4407958984375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.023866602738266476, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10026729129208353, "kl": 0.0005350112915039062, "learning_rate": 2.3617021276595745e-07, "loss": 0.0808, "num_tokens": 82364361.0, "reward": 0.55078125, "reward_std": 0.3883152902126312, "rewards/accuracy_reward/mean": 0.4097222089767456, "rewards/accuracy_reward/std": 0.49235257506370544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1556919664144516, "rewards/tag_count_reward/std": 0.19330507516860962, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1538.6429443359375, "completions/mean_terminated_length": 859.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.024079697405572427, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3987459032283757, "kl": 0.0007219314575195312, "learning_rate": 2.3829787234042553e-07, "loss": 0.1144, "num_tokens": 83123113.0, "reward": 0.4140625298023224, "reward_std": 0.3538362681865692, "rewards/accuracy_reward/mean": 0.2834821343421936, "rewards/accuracy_reward/std": 0.4511922299861908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1305803507566452, "rewards/tag_count_reward/std": 0.19349630177021027, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1538.2410888671875, "completions/mean_terminated_length": 933.990234375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.024292792072878377, "frac_reward_zero_std": 0.0, "grad_norm": 0.11072220993669099, "kl": 0.0007066726684570312, "learning_rate": 2.404255319148936e-07, "loss": 0.1026, "num_tokens": 83885317.0, "reward": 0.4910714626312256, "reward_std": 0.3769613802433014, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1450892835855484, "rewards/tag_count_reward/std": 0.18360786139965057, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1453.040283203125, "completions/mean_terminated_length": 884.0611572265625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.024505886740184328, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10832449307063403, "kl": 0.0007677078247070312, "learning_rate": 2.425531914893617e-07, "loss": 0.0841, "num_tokens": 84601255.0, "reward": 0.5675223469734192, "reward_std": 0.3686137795448303, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1478794664144516, "rewards/tag_count_reward/std": 0.1746872216463089, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1678.15185546875, "completions/mean_terminated_length": 897.3611450195312, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.02471898140749028, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10803386197177677, "kl": 0.00069427490234375, "learning_rate": 2.4468085106382976e-07, "loss": 0.0946, "num_tokens": 85424011.0, "reward": 0.322544664144516, "reward_std": 0.28670406341552734, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.4138607978820801, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1037946417927742, "rewards/tag_count_reward/std": 0.16417749226093292, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1465.325927734375, "completions/mean_terminated_length": 969.33056640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.02493207607479623, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12501492248088594, "kl": 0.00067901611328125, "learning_rate": 2.4680851063829784e-07, "loss": 0.0718, "num_tokens": 86160477.0, "reward": 0.547433078289032, "reward_std": 0.4016090929508209, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16796875, "rewards/tag_count_reward/std": 0.18947620689868927, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1441.227783203125, "completions/mean_terminated_length": 886.3162841796875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.02514517074210218, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1035698561749098, "kl": 0.0006999969482421875, "learning_rate": 2.489361702127659e-07, "loss": 0.0778, "num_tokens": 86878835.0, "reward": 0.4960937798023224, "reward_std": 0.3432289958000183, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.1860520839691162, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1485.966552734375, "completions/mean_terminated_length": 1032.713623046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02535826540940813, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12158033555966147, "kl": 0.0007266998291015625, "learning_rate": 2.51063829787234e-07, "loss": 0.0667, "num_tokens": 87610900.0, "reward": 0.6183035969734192, "reward_std": 0.34593039751052856, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1450892835855484, "rewards/tag_count_reward/std": 0.16851899027824402, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1660.310302734375, "completions/mean_terminated_length": 1109.1622314453125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.02557136007671408, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.08667027420443743, "kl": 0.0006589889526367188, "learning_rate": 2.531914893617021e-07, "loss": 0.0783, "num_tokens": 88430783.0, "reward": 0.4860491156578064, "reward_std": 0.3413888216018677, "rewards/accuracy_reward/mean": 0.3325892984867096, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1534598171710968, "rewards/tag_count_reward/std": 0.201433002948761, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1556.450927734375, "completions/mean_terminated_length": 952.407958984375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02578445474402003, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.40394700910506426, "kl": 0.0006160736083984375, "learning_rate": 2.5531914893617016e-07, "loss": 0.121, "num_tokens": 89201337.0, "reward": 0.4598214626312256, "reward_std": 0.36382588744163513, "rewards/accuracy_reward/mean": 0.3214285671710968, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1383928507566452, "rewards/tag_count_reward/std": 0.19669893383979797, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1499.9107666015625, "completions/mean_terminated_length": 844.3529663085938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.025997549411325982, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.21764557937746304, "kl": 0.0005817413330078125, "learning_rate": 2.574468085106383e-07, "loss": 0.0724, "num_tokens": 89940417.0, "reward": 0.500558078289032, "reward_std": 0.3537725806236267, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1411830335855484, "rewards/tag_count_reward/std": 0.17302851378917694, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1537.9576416015625, "completions/mean_terminated_length": 938.7815551757812, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.026210644078631933, "frac_reward_zero_std": 0.0, "grad_norm": 0.0998604700188912, "kl": 0.0006914138793945312, "learning_rate": 2.5957446808510637e-07, "loss": 0.0948, "num_tokens": 90706206.0, "reward": 0.4224330484867096, "reward_std": 0.38441112637519836, "rewards/accuracy_reward/mean": 0.2790178656578064, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.18148697912693024, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1528.634033203125, "completions/mean_terminated_length": 934.7176513671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.026423738745937883, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.8214136488274478, "kl": 0.0007534027099609375, "learning_rate": 2.6170212765957445e-07, "loss": 0.1294, "num_tokens": 91453514.0, "reward": 0.5412946939468384, "reward_std": 0.3938583731651306, "rewards/accuracy_reward/mean": 0.40046295523643494, "rewards/accuracy_reward/std": 0.49056029319763184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1551339328289032, "rewards/tag_count_reward/std": 0.18974366784095764, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1619.35498046875, "completions/mean_terminated_length": 975.18994140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.026636833413243834, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09679444024126113, "kl": 0.0006990432739257812, "learning_rate": 2.6382978723404253e-07, "loss": 0.0409, "num_tokens": 92242329.0, "reward": 0.396763414144516, "reward_std": 0.31771764159202576, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45011183619499207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1155133917927742, "rewards/tag_count_reward/std": 0.16442348062992096, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1606.602783203125, "completions/mean_terminated_length": 930.7909545898438, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.026849928080549784, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11909391566085942, "kl": 0.0006961822509765625, "learning_rate": 2.659574468085106e-07, "loss": 0.0649, "num_tokens": 93034135.0, "reward": 0.4140625298023224, "reward_std": 0.3400106132030487, "rewards/accuracy_reward/mean": 0.2901785671710968, "rewards/accuracy_reward/std": 0.4543520212173462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1238839253783226, "rewards/tag_count_reward/std": 0.16126208007335663, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1516.634033203125, "completions/mean_terminated_length": 827.2205200195312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.027063022747855735, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.0959436264047987, "kl": 0.0006580352783203125, "learning_rate": 2.680851063829787e-07, "loss": 0.0871, "num_tokens": 93788867.0, "reward": 0.4285714626312256, "reward_std": 0.39995086193084717, "rewards/accuracy_reward/mean": 0.3058035671710968, "rewards/accuracy_reward/std": 0.461262047290802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1227678582072258, "rewards/tag_count_reward/std": 0.16552923619747162, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1601.821533203125, "completions/mean_terminated_length": 865.2307739257812, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.027276117415161685, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09881685227109833, "kl": 0.0006542205810546875, "learning_rate": 2.702127659574468e-07, "loss": 0.116, "num_tokens": 94576019.0, "reward": 0.3710937798023224, "reward_std": 0.3424123525619507, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.43349677324295044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.12109375, "rewards/tag_count_reward/std": 0.17613907158374786, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1371.2523193359375, "completions/mean_terminated_length": 868.2996215820312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.027489212082467636, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12308943825185102, "kl": 0.0006923675537109375, "learning_rate": 2.723404255319149e-07, "loss": 0.0807, "num_tokens": 95257508.0, "reward": 0.6233259439468384, "reward_std": 0.39112555980682373, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1813616007566452, "rewards/tag_count_reward/std": 0.19688211381435394, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1442.232177734375, "completions/mean_terminated_length": 868.0695190429688, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.027702306749773586, "frac_reward_zero_std": 0.0, "grad_norm": 0.10037850525564837, "kl": 0.000583648681640625, "learning_rate": 2.74468085106383e-07, "loss": 0.109, "num_tokens": 95967964.0, "reward": 0.5206473469734192, "reward_std": 0.3918469548225403, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.1434151828289032, "rewards/tag_count_reward/std": 0.16953729093074799, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1480.88623046875, "completions/mean_terminated_length": 802.5735473632812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.027915401417079537, "frac_reward_zero_std": 0.0, "grad_norm": 0.10545902003012357, "kl": 0.0006513595581054688, "learning_rate": 2.7659574468085106e-07, "loss": 0.1391, "num_tokens": 96700249.0, "reward": 0.5078125, "reward_std": 0.376852810382843, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1350446492433548, "rewards/tag_count_reward/std": 0.16858935356140137, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1606.58935546875, "completions/mean_terminated_length": 967.387939453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.028128496084385488, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.5570782586541217, "kl": 0.0007734298706054688, "learning_rate": 2.7872340425531914e-07, "loss": 0.0926, "num_tokens": 97496401.0, "reward": 0.3152901828289032, "reward_std": 0.3668878674507141, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3907487094402313, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1277901828289032, "rewards/tag_count_reward/std": 0.1721460223197937, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1222.1004638671875, "completions/mean_terminated_length": 826.8680419921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.028341590751691438, "frac_reward_zero_std": 0.0, "grad_norm": 0.3014704904699505, "kl": 0.0007181167602539062, "learning_rate": 2.808510638297872e-07, "loss": 0.1489, "num_tokens": 98104638.0, "reward": 0.6679688096046448, "reward_std": 0.37830984592437744, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1746651828289032, "rewards/tag_count_reward/std": 0.17229825258255005, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1480.9532470703125, "completions/mean_terminated_length": 966.991455078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.02855468541899739, "frac_reward_zero_std": 0.0, "grad_norm": 0.0991900877404499, "kl": 0.00064849853515625, "learning_rate": 2.8297872340425535e-07, "loss": 0.0876, "num_tokens": 98839273.0, "reward": 0.5306919813156128, "reward_std": 0.3562542796134949, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1489955335855484, "rewards/tag_count_reward/std": 0.1737341433763504, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1451.88623046875, "completions/mean_terminated_length": 979.7640380859375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02876778008630334, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11305584827542739, "kl": 0.000728607177734375, "learning_rate": 2.8510638297872343e-07, "loss": 0.0675, "num_tokens": 99555910.0, "reward": 0.6707589626312256, "reward_std": 0.41780179738998413, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1662946492433548, "rewards/tag_count_reward/std": 0.17288866639137268, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1470.16748046875, "completions/mean_terminated_length": 999.9473876953125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.02898087475360929, "frac_reward_zero_std": 0.0, "grad_norm": 0.10307618624591357, "kl": 0.0007343292236328125, "learning_rate": 2.872340425531915e-07, "loss": 0.1195, "num_tokens": 100281729.0, "reward": 0.5686384439468384, "reward_std": 0.3552924692630768, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.18678203225135803, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1476.3795166015625, "completions/mean_terminated_length": 851.33642578125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.02919396942091524, "frac_reward_zero_std": 0.0, "grad_norm": 0.11018190902562928, "kl": 0.0007200241088867188, "learning_rate": 2.8936170212765954e-07, "loss": 0.078, "num_tokens": 101012091.0, "reward": 0.4799107313156128, "reward_std": 0.3507802188396454, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1383928507566452, "rewards/tag_count_reward/std": 0.1700088381767273, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1371.3907470703125, "completions/mean_terminated_length": 949.7355346679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.02940706408822119, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145734046911678, "kl": 0.0007352828979492188, "learning_rate": 2.914893617021276e-07, "loss": 0.0348, "num_tokens": 101693306.0, "reward": 0.6774553656578064, "reward_std": 0.38402020931243896, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1841517835855484, "rewards/tag_count_reward/std": 0.19390879571437836, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1589.919677734375, "completions/mean_terminated_length": 944.6666870117188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.02962015875552714, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10102389689442862, "kl": 0.0006847381591796875, "learning_rate": 2.936170212765957e-07, "loss": 0.0882, "num_tokens": 102481718.0, "reward": 0.4135044813156128, "reward_std": 0.35761624574661255, "rewards/accuracy_reward/mean": 0.2834821343421936, "rewards/accuracy_reward/std": 0.4511922299861908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1300223171710968, "rewards/tag_count_reward/std": 0.1671494096517563, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1607.1607666015625, "completions/mean_terminated_length": 1055.5577392578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.029833253422833092, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13531349240595333, "kl": 0.0006647109985351562, "learning_rate": 2.957446808510638e-07, "loss": 0.0794, "num_tokens": 103270814.0, "reward": 0.4375000298023224, "reward_std": 0.3748447597026825, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45739173889160156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.18328121304512024, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1502.3751220703125, "completions/mean_terminated_length": 931.8355712890625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.030046348090139043, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10403643046517459, "kl": 0.0007505416870117188, "learning_rate": 2.978723404255319e-07, "loss": 0.0955, "num_tokens": 104012902.0, "reward": 0.4955357313156128, "reward_std": 0.3992388844490051, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1517857164144516, "rewards/tag_count_reward/std": 0.17332497239112854, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1573.8304443359375, "completions/mean_terminated_length": 1006.6863403320312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.030259442757444997, "frac_reward_zero_std": 0.0, "grad_norm": 0.12303349814944814, "kl": 0.0007162094116210938, "learning_rate": 3e-07, "loss": 0.0961, "num_tokens": 104791098.0, "reward": 0.4771205484867096, "reward_std": 0.3802442252635956, "rewards/accuracy_reward/mean": 0.3191964328289032, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1579241007566452, "rewards/tag_count_reward/std": 0.19438061118125916, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1450.3348388671875, "completions/mean_terminated_length": 950.6474609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.030472537424750947, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1059472942565495, "kl": 0.0007429122924804688, "learning_rate": 3.0212765957446807e-07, "loss": 0.0753, "num_tokens": 105511520.0, "reward": 0.6356027126312256, "reward_std": 0.3977864980697632, "rewards/accuracy_reward/mean": 0.47685185074806213, "rewards/accuracy_reward/std": 0.5000429749488831, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.19839808344841003, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1413.8438720703125, "completions/mean_terminated_length": 995.7703247070312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.030685632092056898, "frac_reward_zero_std": 0.0, "grad_norm": 0.10889084841332757, "kl": 0.000713348388671875, "learning_rate": 3.0425531914893615e-07, "loss": 0.0773, "num_tokens": 106217258.0, "reward": 0.6462053656578064, "reward_std": 0.3555229902267456, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1841517835855484, "rewards/tag_count_reward/std": 0.18277306854724884, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1514.6563720703125, "completions/mean_terminated_length": 971.7026977539062, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.03089872675936285, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09982227377646177, "kl": 0.0007123947143554688, "learning_rate": 3.063829787234042e-07, "loss": 0.088, "num_tokens": 106963776.0, "reward": 0.535714328289032, "reward_std": 0.3437630534172058, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1495535671710968, "rewards/tag_count_reward/std": 0.17842154204845428, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1284.5826416015625, "completions/mean_terminated_length": 776.5836181640625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0311118214266688, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.5199829146786306, "kl": 0.0014495849609375, "learning_rate": 3.085106382978723e-07, "loss": 0.1156, "num_tokens": 107604533.0, "reward": 0.5636160969734192, "reward_std": 0.3381498456001282, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1863839328289032, "rewards/tag_count_reward/std": 0.18733346462249756, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1435.5023193359375, "completions/mean_terminated_length": 854.9608154296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.031324916093974746, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15615433434201076, "kl": 0.00084686279296875, "learning_rate": 3.1063829787234044e-07, "loss": 0.1369, "num_tokens": 108314566.0, "reward": 0.5217634439468384, "reward_std": 0.3766101896762848, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1512276828289032, "rewards/tag_count_reward/std": 0.1917879283428192, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1588.509033203125, "completions/mean_terminated_length": 1033.95068359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.031538010761280696, "frac_reward_zero_std": 0.0, "grad_norm": 0.09700597541344583, "kl": 0.0007734298706054688, "learning_rate": 3.127659574468085e-07, "loss": 0.1141, "num_tokens": 109097370.0, "reward": 0.5128348469734192, "reward_std": 0.34065452218055725, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.4834881126880646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1422991007566452, "rewards/tag_count_reward/std": 0.17372694611549377, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1526.5023193359375, "completions/mean_terminated_length": 1049.5770263671875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03175110542858665, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10050245436910464, "kl": 0.0007734298706054688, "learning_rate": 3.148936170212766e-07, "loss": 0.109, "num_tokens": 109847387.0, "reward": 0.6305803656578064, "reward_std": 0.3712863624095917, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1662946492433548, "rewards/tag_count_reward/std": 0.1776748150587082, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1575.950927734375, "completions/mean_terminated_length": 846.4204711914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0319642000958926, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12137704010035566, "kl": 0.0007715225219726562, "learning_rate": 3.170212765957447e-07, "loss": 0.072, "num_tokens": 110629173.0, "reward": 0.3431919813156128, "reward_std": 0.31204429268836975, "rewards/accuracy_reward/mean": 0.2075892835855484, "rewards/accuracy_reward/std": 0.4060344398021698, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1356026828289032, "rewards/tag_count_reward/std": 0.18889549374580383, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1409.247802734375, "completions/mean_terminated_length": 916.9288940429688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.03217729476319855, "frac_reward_zero_std": 0.0, "grad_norm": 0.1022040412979308, "kl": 0.0007123947143554688, "learning_rate": 3.1914893617021275e-07, "loss": 0.1035, "num_tokens": 111330628.0, "reward": 0.4921875298023224, "reward_std": 0.32500773668289185, "rewards/accuracy_reward/mean": 0.3102678656578064, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1819196492433548, "rewards/tag_count_reward/std": 0.20370477437973022, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1384.4107666015625, "completions/mean_terminated_length": 824.5925903320312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0323903894305045, "frac_reward_zero_std": 0.0, "grad_norm": 0.10801845039896064, "kl": 0.0007963180541992188, "learning_rate": 3.2127659574468083e-07, "loss": 0.1319, "num_tokens": 112022108.0, "reward": 0.520089328289032, "reward_std": 0.3736415505409241, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.19075748324394226, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1509.2545166015625, "completions/mean_terminated_length": 1016.5556030273438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.03260348409781045, "frac_reward_zero_std": 0.0, "grad_norm": 0.10586276827109281, "kl": 0.000843048095703125, "learning_rate": 3.2340425531914897e-07, "loss": 0.0882, "num_tokens": 112765406.0, "reward": 0.5619419813156128, "reward_std": 0.3749977648258209, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1668526828289032, "rewards/tag_count_reward/std": 0.18220780789852142, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1344.02685546875, "completions/mean_terminated_length": 811.2157592773438, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.0328165787651164, "frac_reward_zero_std": 0.0, "grad_norm": 0.12354788082657898, "kl": 0.0008745193481445312, "learning_rate": 3.2553191489361704e-07, "loss": 0.1751, "num_tokens": 113434410.0, "reward": 0.5680803656578064, "reward_std": 0.36260661482810974, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1863839328289032, "rewards/tag_count_reward/std": 0.18733346462249756, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1332.165283203125, "completions/mean_terminated_length": 790.3765258789062, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.03302967343242235, "frac_reward_zero_std": 0.0, "grad_norm": 0.10072773853337534, "kl": 0.0007638931274414062, "learning_rate": 3.2765957446808507e-07, "loss": 0.1433, "num_tokens": 114097444.0, "reward": 0.5267857313156128, "reward_std": 0.3183961808681488, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1540178507566452, "rewards/tag_count_reward/std": 0.18918029963970184, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1328.790283203125, "completions/mean_terminated_length": 808.7461547851562, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0332427680997283, "frac_reward_zero_std": 0.0, "grad_norm": 0.11892638203854576, "kl": 0.000904083251953125, "learning_rate": 3.2978723404255315e-07, "loss": 0.0896, "num_tokens": 114762710.0, "reward": 0.582589328289032, "reward_std": 0.35534539818763733, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1763392835855484, "rewards/tag_count_reward/std": 0.19754758477210999, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1416.794677734375, "completions/mean_terminated_length": 844.6808471679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03345586276703426, "frac_reward_zero_std": 0.0, "grad_norm": 0.1094296697067676, "kl": 0.000827789306640625, "learning_rate": 3.3191489361702123e-07, "loss": 0.1421, "num_tokens": 115465082.0, "reward": 0.486607164144516, "reward_std": 0.4065491855144501, "rewards/accuracy_reward/mean": 0.3214285671710968, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1651785671710968, "rewards/tag_count_reward/std": 0.19300860166549683, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1278.243408203125, "completions/mean_terminated_length": 871.0341186523438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.03366895743434021, "frac_reward_zero_std": 0.0, "grad_norm": 0.11140758435887073, "kl": 0.000904083251953125, "learning_rate": 3.340425531914893e-07, "loss": 0.0945, "num_tokens": 116102375.0, "reward": 0.6266741156578064, "reward_std": 0.39421525597572327, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2003348171710968, "rewards/tag_count_reward/std": 0.18820029497146606, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1166.5826416015625, "completions/mean_terminated_length": 778.3054809570312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.03388205210164616, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.8275997814768032, "kl": 0.0020885467529296875, "learning_rate": 3.3617021276595744e-07, "loss": 0.1255, "num_tokens": 116691260.0, "reward": 0.7327009439468384, "reward_std": 0.3838199973106384, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2081473171710968, "rewards/tag_count_reward/std": 0.1966029405593872, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1379.5357666015625, "completions/mean_terminated_length": 845.3011474609375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.03409514676895211, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10448353642895948, "kl": 0.0009469985961914062, "learning_rate": 3.382978723404255e-07, "loss": 0.1081, "num_tokens": 117378204.0, "reward": 0.5714285969734192, "reward_std": 0.37186500430107117, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1919642835855484, "rewards/tag_count_reward/std": 0.2186294049024582, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1436.43310546875, "completions/mean_terminated_length": 901.6317749023438, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.03430824143625806, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10995124087304915, "kl": 0.0010004043579101562, "learning_rate": 3.404255319148936e-07, "loss": 0.0847, "num_tokens": 118091838.0, "reward": 0.59765625, "reward_std": 0.4327698051929474, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.2025824636220932, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1471.9754638671875, "completions/mean_terminated_length": 807.331787109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03452133610356401, "frac_reward_zero_std": 0.0, "grad_norm": 0.1087052374101123, "kl": 0.0008993148803710938, "learning_rate": 3.425531914893617e-07, "loss": 0.089, "num_tokens": 118829891.0, "reward": 0.4185267984867096, "reward_std": 0.34510934352874756, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44215917587280273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1529017835855484, "rewards/tag_count_reward/std": 0.1893485188484192, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1558.904052734375, "completions/mean_terminated_length": 1056.5294189453125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.03473443077086996, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.0951884165325215, "kl": 0.0008544921875, "learning_rate": 3.4468085106382976e-07, "loss": 0.0621, "num_tokens": 119604184.0, "reward": 0.598214328289032, "reward_std": 0.39945685863494873, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1696428507566452, "rewards/tag_count_reward/std": 0.17999590933322906, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1464.060302734375, "completions/mean_terminated_length": 966.987548828125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.03494752543817591, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10062776057654353, "kl": 0.0009670257568359375, "learning_rate": 3.4680851063829784e-07, "loss": 0.077, "num_tokens": 120323235.0, "reward": 0.5703125, "reward_std": 0.34405753016471863, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1551339328289032, "rewards/tag_count_reward/std": 0.1686781793832779, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1355.7701416015625, "completions/mean_terminated_length": 761.19921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03516062010548186, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 1.5572030012754672, "kl": 0.004134178161621094, "learning_rate": 3.4893617021276597e-07, "loss": 0.1065, "num_tokens": 121002828.0, "reward": 0.5150669813156128, "reward_std": 0.3479662537574768, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1668526828289032, "rewards/tag_count_reward/std": 0.1852518767118454, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1444.01123046875, "completions/mean_terminated_length": 896.56591796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03537371477278781, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10302955056637572, "kl": 0.0010671615600585938, "learning_rate": 3.5106382978723405e-07, "loss": 0.0967, "num_tokens": 121710881.0, "reward": 0.5379464626312256, "reward_std": 0.37933671474456787, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1674107164144516, "rewards/tag_count_reward/std": 0.19397637248039246, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1614.5179443359375, "completions/mean_terminated_length": 986.7977905273438, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.035586809440093764, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10032984664205931, "kl": 0.0009374618530273438, "learning_rate": 3.5319148936170213e-07, "loss": 0.0689, "num_tokens": 122511385.0, "reward": 0.416294664144516, "reward_std": 0.3271806836128235, "rewards/accuracy_reward/mean": 0.2767857015132904, "rewards/accuracy_reward/std": 0.44790977239608765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1395089328289032, "rewards/tag_count_reward/std": 0.17155487835407257, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1462.7054443359375, "completions/mean_terminated_length": 959.9834594726562, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.035799904107399715, "frac_reward_zero_std": 0.0, "grad_norm": 0.09878559508081752, "kl": 0.0010223388671875, "learning_rate": 3.553191489361702e-07, "loss": 0.0911, "num_tokens": 123237749.0, "reward": 0.5909598469734192, "reward_std": 0.4037070870399475, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1802455335855484, "rewards/tag_count_reward/std": 0.1921715885400772, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1550.55810546875, "completions/mean_terminated_length": 1016.2685546875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.036012998774705665, "frac_reward_zero_std": 0.0, "grad_norm": 0.12871041056750562, "kl": 0.0010776519775390625, "learning_rate": 3.574468085106383e-07, "loss": 0.0877, "num_tokens": 123999343.0, "reward": 0.555245578289032, "reward_std": 0.430169939994812, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.18901443481445312, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1412.0201416015625, "completions/mean_terminated_length": 921.8379516601562, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.036226093442011616, "frac_reward_zero_std": 0.0, "grad_norm": 0.10024261913313043, "kl": 0.0009641647338867188, "learning_rate": 3.5957446808510637e-07, "loss": 0.0877, "num_tokens": 124710376.0, "reward": 0.6757813096046448, "reward_std": 0.3762159049510956, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2003348171710968, "rewards/tag_count_reward/std": 0.21652869880199432, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1484.6585693359375, "completions/mean_terminated_length": 950.7086791992188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.036439188109317566, "frac_reward_zero_std": 0.0, "grad_norm": 0.10476556008040915, "kl": 0.0009899139404296875, "learning_rate": 3.617021276595745e-07, "loss": 0.1203, "num_tokens": 125451983.0, "reward": 0.482700914144516, "reward_std": 0.37238675355911255, "rewards/accuracy_reward/mean": 0.3013392984867096, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1813616007566452, "rewards/tag_count_reward/std": 0.21585306525230408, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1568.58935546875, "completions/mean_terminated_length": 911.6190185546875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.03665228277662352, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.09391397013883276, "kl": 0.0009832382202148438, "learning_rate": 3.638297872340426e-07, "loss": 0.1185, "num_tokens": 126225223.0, "reward": 0.4252232313156128, "reward_std": 0.31059473752975464, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44215917587280273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1595982164144516, "rewards/tag_count_reward/std": 0.19694946706295013, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1458.7054443359375, "completions/mean_terminated_length": 919.77783203125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.03686537744392947, "frac_reward_zero_std": 0.0, "grad_norm": 0.2044661745008977, "kl": 0.0010404586791992188, "learning_rate": 3.659574468085106e-07, "loss": 0.1051, "num_tokens": 126950115.0, "reward": 0.4542410969734192, "reward_std": 0.35335448384284973, "rewards/accuracy_reward/mean": 0.2767857015132904, "rewards/accuracy_reward/std": 0.44790980219841003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1774553507566452, "rewards/tag_count_reward/std": 0.19866611063480377, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1458.950927734375, "completions/mean_terminated_length": 925.0467529296875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.03707847211123542, "frac_reward_zero_std": 0.0, "grad_norm": 0.10181673395391216, "kl": 0.0010509490966796875, "learning_rate": 3.680851063829787e-07, "loss": 0.098, "num_tokens": 127672733.0, "reward": 0.4877232313156128, "reward_std": 0.3558724820613861, "rewards/accuracy_reward/mean": 0.3080357015132904, "rewards/accuracy_reward/std": 0.462197482585907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.21885482966899872, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1423.107177734375, "completions/mean_terminated_length": 967.104248046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.03729156677854137, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10329705877408646, "kl": 0.0010509490966796875, "learning_rate": 3.7021276595744676e-07, "loss": 0.153, "num_tokens": 128378301.0, "reward": 0.6316964626312256, "reward_std": 0.3447173535823822, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1763392835855484, "rewards/tag_count_reward/std": 0.18662908673286438, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1437.8751220703125, "completions/mean_terminated_length": 869.8275756835938, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.03750466144584732, "frac_reward_zero_std": 0.0, "grad_norm": 0.14223293621782093, "kl": 0.0011081695556640625, "learning_rate": 3.7234042553191484e-07, "loss": 0.0979, "num_tokens": 129098501.0, "reward": 0.551339328289032, "reward_std": 0.4390011727809906, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1763392835855484, "rewards/tag_count_reward/std": 0.20035871863365173, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1521.4732666015625, "completions/mean_terminated_length": 908.4637451171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03771775611315327, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10621048236220616, "kl": 0.0010623931884765625, "learning_rate": 3.744680851063829e-07, "loss": 0.0978, "num_tokens": 129853849.0, "reward": 0.4726562798023224, "reward_std": 0.3519934415817261, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.46403056383132935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16015625, "rewards/tag_count_reward/std": 0.19254150986671448, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1387.5023193359375, "completions/mean_terminated_length": 809.912109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03793085078045922, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1090509653471645, "kl": 0.00127410888671875, "learning_rate": 3.7659574468085106e-07, "loss": 0.0839, "num_tokens": 130542586.0, "reward": 0.4977678656578064, "reward_std": 0.33674728870391846, "rewards/accuracy_reward/mean": 0.3191964328289032, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1785714328289032, "rewards/tag_count_reward/std": 0.19766129553318024, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1453.8170166015625, "completions/mean_terminated_length": 961.4938354492188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.03814394544776517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10596649246298591, "kl": 0.0011997222900390625, "learning_rate": 3.7872340425531914e-07, "loss": 0.1126, "num_tokens": 131267480.0, "reward": 0.6222098469734192, "reward_std": 0.363572895526886, "rewards/accuracy_reward/mean": 0.46759259700775146, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1713169664144516, "rewards/tag_count_reward/std": 0.1732303947210312, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1402.009033203125, "completions/mean_terminated_length": 821.7118530273438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03835704011507112, "frac_reward_zero_std": 0.0, "grad_norm": 0.11570147599392522, "kl": 0.0011806488037109375, "learning_rate": 3.808510638297872e-07, "loss": 0.1471, "num_tokens": 131960428.0, "reward": 0.4899553656578064, "reward_std": 0.35651370882987976, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.47195106744766235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1685267835855484, "rewards/tag_count_reward/std": 0.18561963737010956, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1420.6719970703125, "completions/mean_terminated_length": 886.6652221679688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.03857013478237707, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10997003403263109, "kl": 0.001247406005859375, "learning_rate": 3.829787234042553e-07, "loss": 0.1163, "num_tokens": 132671193.0, "reward": 0.5524553656578064, "reward_std": 0.37749508023262024, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1685267835855484, "rewards/tag_count_reward/std": 0.17949236929416656, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1397.524658203125, "completions/mean_terminated_length": 900.7047119140625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.03878322944968302, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10221610579664953, "kl": 0.001186370849609375, "learning_rate": 3.8510638297872337e-07, "loss": 0.1086, "num_tokens": 133370180.0, "reward": 0.5831473469734192, "reward_std": 0.36309346556663513, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1880580335855484, "rewards/tag_count_reward/std": 0.20119112730026245, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1366.118408203125, "completions/mean_terminated_length": 882.0343627929688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03899632411698897, "frac_reward_zero_std": 0.0, "grad_norm": 0.10232054026371493, "kl": 0.0011768341064453125, "learning_rate": 3.8723404255319145e-07, "loss": 0.0929, "num_tokens": 134047513.0, "reward": 0.6077009439468384, "reward_std": 0.3802623450756073, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1925223171710968, "rewards/tag_count_reward/std": 0.19691382348537445, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1442.3660888671875, "completions/mean_terminated_length": 917.4833984375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.039209418784294923, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12095782636359531, "kl": 0.0012035369873046875, "learning_rate": 3.893617021276596e-07, "loss": 0.1314, "num_tokens": 134762861.0, "reward": 0.555245578289032, "reward_std": 0.32575860619544983, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1735491007566452, "rewards/tag_count_reward/std": 0.19541189074516296, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1410.774658203125, "completions/mean_terminated_length": 868.3429565429688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.039422513451600874, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09501894589540566, "kl": 0.0010967254638671875, "learning_rate": 3.9148936170212766e-07, "loss": 0.1342, "num_tokens": 135467432.0, "reward": 0.559151828289032, "reward_std": 0.3499615788459778, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.18940123915672302, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1304.6451416015625, "completions/mean_terminated_length": 823.6507568359375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.039635608118906825, "frac_reward_zero_std": 0.0, "grad_norm": 0.11156289833346619, "kl": 0.0013427734375, "learning_rate": 3.9361702127659574e-07, "loss": 0.1297, "num_tokens": 136117049.0, "reward": 0.640625, "reward_std": 0.3706938624382019, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2098214328289032, "rewards/tag_count_reward/std": 0.21494384109973907, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1588.80810546875, "completions/mean_terminated_length": 917.6813354492188, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.039848702786212775, "frac_reward_zero_std": 0.0, "grad_norm": 0.09302606053602087, "kl": 0.0011444091796875, "learning_rate": 3.957446808510638e-07, "loss": 0.1349, "num_tokens": 136892259.0, "reward": 0.3989955484867096, "reward_std": 0.33329832553863525, "rewards/accuracy_reward/mean": 0.2522321343421936, "rewards/accuracy_reward/std": 0.4347792863845825, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1467633992433548, "rewards/tag_count_reward/std": 0.18341653048992157, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1513.9442138671875, "completions/mean_terminated_length": 1029.8851318359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.040061797453518726, "frac_reward_zero_std": 0.0, "grad_norm": 26.97107691584283, "kl": 0.43701171875, "learning_rate": 3.978723404255319e-07, "loss": 0.1323, "num_tokens": 137643946.0, "reward": 0.5691964626312256, "reward_std": 0.4128606915473938, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1897321492433548, "rewards/tag_count_reward/std": 0.19995954632759094, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1372.8126220703125, "completions/mean_terminated_length": 893.4808959960938, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.040274892120824676, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10437976458759893, "kl": 0.001232147216796875, "learning_rate": 4e-07, "loss": 0.1313, "num_tokens": 138321830.0, "reward": 0.6858259439468384, "reward_std": 0.4512696862220764, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1880580335855484, "rewards/tag_count_reward/std": 0.20049497485160828, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1419.5982666015625, "completions/mean_terminated_length": 870.0753173828125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.04048798678813063, "frac_reward_zero_std": 0.0, "grad_norm": 0.10374834102028774, "kl": 0.001277923583984375, "learning_rate": 4.021276595744681e-07, "loss": 0.1116, "num_tokens": 139033826.0, "reward": 0.5301339626312256, "reward_std": 0.41140657663345337, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1573660671710968, "rewards/tag_count_reward/std": 0.18714678287506104, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1384.44873046875, "completions/mean_terminated_length": 814.5104370117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.04070108145543658, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10744661867463708, "kl": 0.0012378692626953125, "learning_rate": 4.0425531914893614e-07, "loss": 0.1451, "num_tokens": 139725307.0, "reward": 0.5078125, "reward_std": 0.3229517936706543, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1774553507566452, "rewards/tag_count_reward/std": 0.19583068788051605, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1510.5804443359375, "completions/mean_terminated_length": 825.8477172851562, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.04091417612274253, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10491696435039138, "kl": 0.001186370849609375, "learning_rate": 4.063829787234042e-07, "loss": 0.1181, "num_tokens": 140478639.0, "reward": 0.4977678656578064, "reward_std": 0.40886130928993225, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.20309406518936157, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1342.15625, "completions/mean_terminated_length": 854.7245483398438, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.04112727079004848, "frac_reward_zero_std": 0.0, "grad_norm": 0.2116215064290924, "kl": 0.0012226104736328125, "learning_rate": 4.085106382978723e-07, "loss": 0.1273, "num_tokens": 141154085.0, "reward": 0.668526828289032, "reward_std": 0.43462803959846497, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2042410671710968, "rewards/tag_count_reward/std": 0.2138228863477707, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1264.0179443359375, "completions/mean_terminated_length": 832.6920776367188, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.04134036545735443, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12045199991956956, "kl": 0.0015106201171875, "learning_rate": 4.106382978723404e-07, "loss": 0.1592, "num_tokens": 141789277.0, "reward": 0.6640625, "reward_std": 0.381980836391449, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2176339328289032, "rewards/tag_count_reward/std": 0.21431276202201843, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1439.227783203125, "completions/mean_terminated_length": 877.4849853515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.04155346012466038, "frac_reward_zero_std": 0.0, "grad_norm": 0.10252329126161434, "kl": 0.0013751983642578125, "learning_rate": 4.1276595744680846e-07, "loss": 0.1338, "num_tokens": 142507475.0, "reward": 0.5418527126312256, "reward_std": 0.44359689950942993, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1780133992433548, "rewards/tag_count_reward/std": 0.21147681772708893, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1302.9576416015625, "completions/mean_terminated_length": 793.1917724609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04176655479196633, "frac_reward_zero_std": 0.0, "grad_norm": 0.10496685851326004, "kl": 0.0014324188232421875, "learning_rate": 4.148936170212766e-07, "loss": 0.1119, "num_tokens": 143163264.0, "reward": 0.6155134439468384, "reward_std": 0.42997801303863525, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1936383992433548, "rewards/tag_count_reward/std": 0.2096090316772461, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1455.091552734375, "completions/mean_terminated_length": 823.9308471679688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.04197964945927228, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09922929912706371, "kl": 0.00128936767578125, "learning_rate": 4.1702127659574467e-07, "loss": 0.1415, "num_tokens": 143890249.0, "reward": 0.5301339626312256, "reward_std": 0.3826269507408142, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100292682647705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1685267835855484, "rewards/tag_count_reward/std": 0.19801151752471924, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1320.82373046875, "completions/mean_terminated_length": 854.6849975585938, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.04219274412657823, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10871571318377708, "kl": 0.0014801025390625, "learning_rate": 4.1914893617021275e-07, "loss": 0.1487, "num_tokens": 144551450.0, "reward": 0.6696428656578064, "reward_std": 0.41537654399871826, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1830357164144516, "rewards/tag_count_reward/std": 0.18765638768672943, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1397.977783203125, "completions/mean_terminated_length": 932.2528686523438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.04240583879388418, "frac_reward_zero_std": 0.0, "grad_norm": 0.10278064122495711, "kl": 0.0013484954833984375, "learning_rate": 4.2127659574468083e-07, "loss": 0.1236, "num_tokens": 145250672.0, "reward": 0.66015625, "reward_std": 0.35941562056541443, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1936383992433548, "rewards/tag_count_reward/std": 0.19723689556121826, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1329.415283203125, "completions/mean_terminated_length": 749.9112548828125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.04261893346119013, "frac_reward_zero_std": 0.0, "grad_norm": 0.11291643137015937, "kl": 0.001422882080078125, "learning_rate": 4.234042553191489e-07, "loss": 0.1212, "num_tokens": 145917818.0, "reward": 0.4693080484867096, "reward_std": 0.35822468996047974, "rewards/accuracy_reward/mean": 0.3194444477558136, "rewards/accuracy_reward/std": 0.4668020009994507, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1612723171710968, "rewards/tag_count_reward/std": 0.20086821913719177, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1432.060302734375, "completions/mean_terminated_length": 990.7547607421875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.04283202812849608, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10012160763790595, "kl": 0.00139617919921875, "learning_rate": 4.25531914893617e-07, "loss": 0.1277, "num_tokens": 146626437.0, "reward": 0.60546875, "reward_std": 0.3882697522640228, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.19921875, "rewards/tag_count_reward/std": 0.20292727649211884, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1527.0469970703125, "completions/mean_terminated_length": 957.406494140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.043045122795802034, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10127694106650852, "kl": 0.0013141632080078125, "learning_rate": 4.276595744680851e-07, "loss": 0.1074, "num_tokens": 147387546.0, "reward": 0.4988839626312256, "reward_std": 0.3934634327888489, "rewards/accuracy_reward/mean": 0.3325892984867096, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1662946492433548, "rewards/tag_count_reward/std": 0.20744557678699493, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1350.497802734375, "completions/mean_terminated_length": 873.2593994140625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.043258217463107984, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10468293347398858, "kl": 0.001506805419921875, "learning_rate": 4.297872340425532e-07, "loss": 0.0818, "num_tokens": 148060089.0, "reward": 0.58203125, "reward_std": 0.38847699761390686, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1891741007566452, "rewards/tag_count_reward/std": 0.20291495323181152, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1348.18310546875, "completions/mean_terminated_length": 842.1615600585938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.043471312130413935, "frac_reward_zero_std": 0.0, "grad_norm": 0.11401171455710374, "kl": 0.0015716552734375, "learning_rate": 4.319148936170213e-07, "loss": 0.1542, "num_tokens": 148731899.0, "reward": 0.6674107313156128, "reward_std": 0.41716843843460083, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2053571492433548, "rewards/tag_count_reward/std": 0.2204943597316742, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1512.7701416015625, "completions/mean_terminated_length": 953.1004028320312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.043684406797719885, "frac_reward_zero_std": 0.0, "grad_norm": 5.156304426547695, "kl": 0.200439453125, "learning_rate": 4.3404255319148936e-07, "loss": 0.1376, "num_tokens": 149483876.0, "reward": 0.4871652126312256, "reward_std": 0.34277215600013733, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.46403056383132935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1746651828289032, "rewards/tag_count_reward/std": 0.2069537490606308, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1311.216552734375, "completions/mean_terminated_length": 753.5725708007812, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.043897501465025836, "frac_reward_zero_std": 0.0, "grad_norm": 0.11912765987063775, "kl": 0.00160980224609375, "learning_rate": 4.3617021276595744e-07, "loss": 0.1512, "num_tokens": 150145973.0, "reward": 0.6702009439468384, "reward_std": 0.38861900568008423, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1902901828289032, "rewards/tag_count_reward/std": 0.19409781694412231, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1315.0692138671875, "completions/mean_terminated_length": 871.1075439453125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.044110596132331786, "frac_reward_zero_std": 0.0, "grad_norm": 0.22304967037251217, "kl": 0.001598358154296875, "learning_rate": 4.382978723404255e-07, "loss": 0.1081, "num_tokens": 150805540.0, "reward": 0.61328125, "reward_std": 0.41968443989753723, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1958705335855484, "rewards/tag_count_reward/std": 0.1999712586402893, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1439.07373046875, "completions/mean_terminated_length": 925.370361328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04432369079963774, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11242731061780704, "kl": 0.0016345977783203125, "learning_rate": 4.404255319148936e-07, "loss": 0.1042, "num_tokens": 151517813.0, "reward": 0.5887277126312256, "reward_std": 0.4099920988082886, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.20703125, "rewards/tag_count_reward/std": 0.22860509157180786, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1487.9442138671875, "completions/mean_terminated_length": 927.888427734375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.04453678546694369, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09382352806250878, "kl": 0.0014190673828125, "learning_rate": 4.425531914893617e-07, "loss": 0.0891, "num_tokens": 152265724.0, "reward": 0.5758928656578064, "reward_std": 0.3854631185531616, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1919642835855484, "rewards/tag_count_reward/std": 0.2211727499961853, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1356.9107666015625, "completions/mean_terminated_length": 913.90478515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.04474988013424964, "frac_reward_zero_std": 0.0, "grad_norm": 0.11980874924488194, "kl": 0.0016326904296875, "learning_rate": 4.4468085106382975e-07, "loss": 0.1465, "num_tokens": 152940100.0, "reward": 0.7003348469734192, "reward_std": 0.37838613986968994, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2159598171710968, "rewards/tag_count_reward/std": 0.2104293406009674, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1390.9844970703125, "completions/mean_terminated_length": 902.6964721679688, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.04496297480155559, "frac_reward_zero_std": 0.0, "grad_norm": 0.11965225096943578, "kl": 0.00153350830078125, "learning_rate": 4.4680851063829783e-07, "loss": 0.0594, "num_tokens": 153625533.0, "reward": 0.6640625, "reward_std": 0.41250860691070557, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2109375, "rewards/tag_count_reward/std": 0.20034313201904297, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1366.4376220703125, "completions/mean_terminated_length": 961.3807983398438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.04517606946886154, "frac_reward_zero_std": 0.0, "grad_norm": 0.11557599958263355, "kl": 0.0016002655029296875, "learning_rate": 4.489361702127659e-07, "loss": 0.1284, "num_tokens": 154304433.0, "reward": 0.6428571939468384, "reward_std": 0.4596438705921173, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2075892835855484, "rewards/tag_count_reward/std": 0.1989581137895584, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1297.5067138671875, "completions/mean_terminated_length": 816.4212646484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04538916413616749, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11110708060337988, "kl": 0.00167083740234375, "learning_rate": 4.51063829787234e-07, "loss": 0.1593, "num_tokens": 154948516.0, "reward": 0.6434152126312256, "reward_std": 0.4187160134315491, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.21484375, "rewards/tag_count_reward/std": 0.22871975600719452, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1404.7344970703125, "completions/mean_terminated_length": 939.6038208007812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04560225880347344, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10196443242568666, "kl": 0.00173187255859375, "learning_rate": 4.5319148936170207e-07, "loss": 0.0848, "num_tokens": 155643757.0, "reward": 0.6238839626312256, "reward_std": 0.3447335958480835, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1863839328289032, "rewards/tag_count_reward/std": 0.18955937027931213, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1305.90625, "completions/mean_terminated_length": 860.6500244140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04581535347077939, "frac_reward_zero_std": 0.0, "grad_norm": 0.11670707779154094, "kl": 0.001613616943359375, "learning_rate": 4.553191489361702e-07, "loss": 0.0844, "num_tokens": 156303331.0, "reward": 0.609933078289032, "reward_std": 0.40606167912483215, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.21484375, "rewards/tag_count_reward/std": 0.2122310847043991, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1245.2879638671875, "completions/mean_terminated_length": 828.966064453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.04602844813808534, "frac_reward_zero_std": 0.0, "grad_norm": 0.1117023704373016, "kl": 0.0017108917236328125, "learning_rate": 4.574468085106383e-07, "loss": 0.1098, "num_tokens": 156931108.0, "reward": 0.6869419813156128, "reward_std": 0.42914876341819763, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2137276828289032, "rewards/tag_count_reward/std": 0.19629153609275818, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1352.977783203125, "completions/mean_terminated_length": 873.0188598632812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.04624154280539129, "frac_reward_zero_std": 0.0, "grad_norm": 0.10908169488479595, "kl": 0.0015811920166015625, "learning_rate": 4.5957446808510636e-07, "loss": 0.0976, "num_tokens": 157599354.0, "reward": 0.6233259439468384, "reward_std": 0.41876834630966187, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1969866007566452, "rewards/tag_count_reward/std": 0.20373158156871796, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1310.982177734375, "completions/mean_terminated_length": 737.74609375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.04645463747269724, "frac_reward_zero_std": 0.0, "grad_norm": 0.13422275796443275, "kl": 0.0016231536865234375, "learning_rate": 4.6170212765957444e-07, "loss": 0.1512, "num_tokens": 158250514.0, "reward": 0.5217634439468384, "reward_std": 0.3856152892112732, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1869419664144516, "rewards/tag_count_reward/std": 0.1959095597267151, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1323.37060546875, "completions/mean_terminated_length": 912.9160766601562, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.04666773214000319, "frac_reward_zero_std": 0.0, "grad_norm": 0.1491887727377492, "kl": 0.0015430450439453125, "learning_rate": 4.638297872340425e-07, "loss": 0.1082, "num_tokens": 158908072.0, "reward": 0.7265625596046448, "reward_std": 0.4064008891582489, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2220982164144516, "rewards/tag_count_reward/std": 0.20698769390583038, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1476.107177734375, "completions/mean_terminated_length": 1002.2529907226562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.04688082680730915, "frac_reward_zero_std": 0.0, "grad_norm": 0.40464057792231767, "kl": 0.007049560546875, "learning_rate": 4.659574468085106e-07, "loss": 0.1002, "num_tokens": 159645704.0, "reward": 0.6155134439468384, "reward_std": 0.3887866735458374, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.19140625, "rewards/tag_count_reward/std": 0.19658386707305908, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1428.685302734375, "completions/mean_terminated_length": 1008.8502197265625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0470939214746151, "frac_reward_zero_std": 0.0, "grad_norm": 0.10143971225602694, "kl": 0.0016422271728515625, "learning_rate": 4.6808510638297873e-07, "loss": 0.0794, "num_tokens": 160358987.0, "reward": 0.6852678656578064, "reward_std": 0.4163140654563904, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2165178507566452, "rewards/tag_count_reward/std": 0.22183778882026672, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1406.0357666015625, "completions/mean_terminated_length": 813.6652221679688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.04730701614192105, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.0996447702921371, "kl": 0.00140380859375, "learning_rate": 4.702127659574468e-07, "loss": 0.111, "num_tokens": 161054443.0, "reward": 0.4910714626312256, "reward_std": 0.3968755006790161, "rewards/accuracy_reward/mean": 0.3080357015132904, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1830357164144516, "rewards/tag_count_reward/std": 0.19639405608177185, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1249.74560546875, "completions/mean_terminated_length": 775.3380737304688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.047520110809227, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11154026812212577, "kl": 0.0016765594482421875, "learning_rate": 4.723404255319149e-07, "loss": 0.1118, "num_tokens": 161679465.0, "reward": 0.6328125, "reward_std": 0.3738306760787964, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1819196492433548, "rewards/tag_count_reward/std": 0.17885135114192963, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1425.3148193359375, "completions/mean_terminated_length": 987.30419921875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.04773320547653295, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.22204444435954832, "kl": 0.003631591796875, "learning_rate": 4.7446808510638297e-07, "loss": 0.1246, "num_tokens": 162393254.0, "reward": 0.6718750596046448, "reward_std": 0.387226939201355, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.203125, "rewards/tag_count_reward/std": 0.20283572375774384, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1356.352783203125, "completions/mean_terminated_length": 856.2384643554688, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0479463001438389, "frac_reward_zero_std": 0.0, "grad_norm": 0.16879309262851652, "kl": 0.0016574859619140625, "learning_rate": 4.7659574468085105e-07, "loss": 0.1004, "num_tokens": 163068660.0, "reward": 0.5870535969734192, "reward_std": 0.39906179904937744, "rewards/accuracy_reward/mean": 0.38657405972480774, "rewards/accuracy_reward/std": 0.4875292479991913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2142857164144516, "rewards/tag_count_reward/std": 0.22832709550857544, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1304.0179443359375, "completions/mean_terminated_length": 866.0709228515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.048159394811144854, "frac_reward_zero_std": 0.0, "grad_norm": 0.11507590868466241, "kl": 0.0015659332275390625, "learning_rate": 4.787234042553192e-07, "loss": 0.098, "num_tokens": 163723852.0, "reward": 0.54296875, "reward_std": 0.370231568813324, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2036830335855484, "rewards/tag_count_reward/std": 0.20806674659252167, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1363.09375, "completions/mean_terminated_length": 876.8626098632812, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.048372489478450804, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10271745026307122, "kl": 0.00164794921875, "learning_rate": 4.808510638297872e-07, "loss": 0.1139, "num_tokens": 164405494.0, "reward": 0.5870535969734192, "reward_std": 0.34285223484039307, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2008928507566452, "rewards/tag_count_reward/std": 0.20505164563655853, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1392.8438720703125, "completions/mean_terminated_length": 896.98046875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.048585584145756755, "frac_reward_zero_std": 0.0, "grad_norm": 0.11303936666548152, "kl": 0.0016937255859375, "learning_rate": 4.829787234042552e-07, "loss": 0.1239, "num_tokens": 165100912.0, "reward": 0.5915178656578064, "reward_std": 0.330929160118103, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2165178507566452, "rewards/tag_count_reward/std": 0.23109875619411469, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1426.247802734375, "completions/mean_terminated_length": 929.3453369140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.048798678813062706, "frac_reward_zero_std": 0.0, "grad_norm": 1.8870956516781585, "kl": 0.0076580047607421875, "learning_rate": 4.851063829787234e-07, "loss": 0.1046, "num_tokens": 165819263.0, "reward": 0.590401828289032, "reward_std": 0.4257414937019348, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2020089328289032, "rewards/tag_count_reward/std": 0.2113564908504486, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1404.247802734375, "completions/mean_terminated_length": 1010.5863647460938, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.049011773480368656, "frac_reward_zero_std": 0.0, "grad_norm": 0.10398513362142894, "kl": 0.0017681121826171875, "learning_rate": 4.872340425531915e-07, "loss": 0.1286, "num_tokens": 166520574.0, "reward": 0.6579241156578064, "reward_std": 0.413931667804718, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2248883992433548, "rewards/tag_count_reward/std": 0.22010265290737152, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1244.966552734375, "completions/mean_terminated_length": 828.4779663085938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.04922486814767461, "frac_reward_zero_std": 0.0, "grad_norm": 0.11912494459626313, "kl": 0.0019626617431640625, "learning_rate": 4.893617021276595e-07, "loss": 0.18, "num_tokens": 167142687.0, "reward": 0.707589328289032, "reward_std": 0.45271551609039307, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2276785671710968, "rewards/tag_count_reward/std": 0.21034106612205505, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1387.950927734375, "completions/mean_terminated_length": 865.1920166015625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.04943796281498056, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.09625235051488164, "kl": 0.001678466796875, "learning_rate": 4.914893617021277e-07, "loss": 0.1264, "num_tokens": 167830249.0, "reward": 0.5345982313156128, "reward_std": 0.42067062854766846, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1908482164144516, "rewards/tag_count_reward/std": 0.2016845941543579, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1409.7054443359375, "completions/mean_terminated_length": 984.966552734375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.04965105748228651, "frac_reward_zero_std": 0.0, "grad_norm": 2.22790115880219, "kl": 0.05279541015625, "learning_rate": 4.936170212765957e-07, "loss": 0.1316, "num_tokens": 168538341.0, "reward": 0.590401828289032, "reward_std": 0.3608662784099579, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2064732164144516, "rewards/tag_count_reward/std": 0.18712009489536285, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1324.9910888671875, "completions/mean_terminated_length": 934.9141235351562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.04986415214959246, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10245777746963397, "kl": 0.0017719268798828125, "learning_rate": 4.957446808510638e-07, "loss": 0.1033, "num_tokens": 169198449.0, "reward": 0.609375, "reward_std": 0.3380663990974426, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2142857164144516, "rewards/tag_count_reward/std": 0.207135409116745, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1462.149658203125, "completions/mean_terminated_length": 881.5067138671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.05007724681689841, "frac_reward_zero_std": 0.0, "grad_norm": 0.10375067384388459, "kl": 0.001827239990234375, "learning_rate": 4.978723404255318e-07, "loss": 0.0947, "num_tokens": 169930804.0, "reward": 0.5915178656578064, "reward_std": 0.3607367277145386, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2098214328289032, "rewards/tag_count_reward/std": 0.22261303663253784, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1387.46435546875, "completions/mean_terminated_length": 914.2068481445312, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.05029034148420436, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10549517129023503, "kl": 0.001728057861328125, "learning_rate": 5e-07, "loss": 0.1101, "num_tokens": 170619508.0, "reward": 0.645089328289032, "reward_std": 0.41320767998695374, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2098214328289032, "rewards/tag_count_reward/std": 0.22008632123470306, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1505.732177734375, "completions/mean_terminated_length": 1056.4244384765625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.05050343615151031, "frac_reward_zero_std": 0.0, "grad_norm": 0.10106960498822302, "kl": 0.0016994476318359375, "learning_rate": 5.02127659574468e-07, "loss": 0.0977, "num_tokens": 171365132.0, "reward": 0.51171875, "reward_std": 0.4081592857837677, "rewards/accuracy_reward/mean": 0.3013392984867096, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2103794664144516, "rewards/tag_count_reward/std": 0.2322404831647873, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1284.4866943359375, "completions/mean_terminated_length": 847.810546875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.05071653081881626, "frac_reward_zero_std": 0.0, "grad_norm": 0.12018573210774196, "kl": 0.002071380615234375, "learning_rate": 5.042553191489361e-07, "loss": 0.0961, "num_tokens": 172004150.0, "reward": 0.6261160969734192, "reward_std": 0.368063747882843, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2154017835855484, "rewards/tag_count_reward/std": 0.20460976660251617, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1364.388427734375, "completions/mean_terminated_length": 922.0514526367188, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.05092962548612221, "frac_reward_zero_std": 0.0, "grad_norm": 0.20252276414839693, "kl": 0.0020904541015625, "learning_rate": 5.063829787234042e-07, "loss": 0.1213, "num_tokens": 172681172.0, "reward": 0.617745578289032, "reward_std": 0.3969407379627228, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2315848171710968, "rewards/tag_count_reward/std": 0.23128168284893036, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1396.1942138671875, "completions/mean_terminated_length": 884.6175537109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.05114272015342816, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10280922103701856, "kl": 0.0018596649169921875, "learning_rate": 5.085106382978723e-07, "loss": 0.1412, "num_tokens": 173370619.0, "reward": 0.5189732313156128, "reward_std": 0.35032397508621216, "rewards/accuracy_reward/mean": 0.3102678656578064, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2087053507566452, "rewards/tag_count_reward/std": 0.22921931743621826, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1334.212158203125, "completions/mean_terminated_length": 813.3397827148438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.05135581482073411, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10163979671398084, "kl": 0.0020809173583984375, "learning_rate": 5.106382978723403e-07, "loss": 0.1552, "num_tokens": 174038906.0, "reward": 0.6501116156578064, "reward_std": 0.4021851718425751, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.23046875, "rewards/tag_count_reward/std": 0.23950733244419098, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1392.9107666015625, "completions/mean_terminated_length": 840.2633666992188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05156890948804006, "frac_reward_zero_std": 0.0, "grad_norm": 0.09688813276520299, "kl": 0.0017910003662109375, "learning_rate": 5.127659574468085e-07, "loss": 0.1088, "num_tokens": 174730418.0, "reward": 0.5340402126312256, "reward_std": 0.3654673397541046, "rewards/accuracy_reward/mean": 0.3214285671710968, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2126116007566452, "rewards/tag_count_reward/std": 0.22341284155845642, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1415.7991943359375, "completions/mean_terminated_length": 945.9533081054688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05178200415534601, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.083676560633267, "kl": 0.0262298583984375, "learning_rate": 5.148936170212766e-07, "loss": 0.0713, "num_tokens": 175436376.0, "reward": 0.5496652126312256, "reward_std": 0.3902077376842499, "rewards/accuracy_reward/mean": 0.35185185074806213, "rewards/accuracy_reward/std": 0.4781017303466797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2103794664144516, "rewards/tag_count_reward/std": 0.21472229063510895, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1399.1407470703125, "completions/mean_terminated_length": 916.9143676757812, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.051995098822651964, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11236993357710642, "kl": 0.001888275146484375, "learning_rate": 5.170212765957447e-07, "loss": 0.082, "num_tokens": 176131559.0, "reward": 0.6696428656578064, "reward_std": 0.4155312478542328, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2299107164144516, "rewards/tag_count_reward/std": 0.23504102230072021, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1472.3013916015625, "completions/mean_terminated_length": 916.8026123046875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.052208193489957914, "frac_reward_zero_std": 0.0, "grad_norm": 0.0985172887983217, "kl": 0.0018062591552734375, "learning_rate": 5.191489361702127e-07, "loss": 0.1042, "num_tokens": 176868382.0, "reward": 0.5172991156578064, "reward_std": 0.39164578914642334, "rewards/accuracy_reward/mean": 0.3080357015132904, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2092633992433548, "rewards/tag_count_reward/std": 0.2489084005355835, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1364.2366943359375, "completions/mean_terminated_length": 909.2416381835938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.052421288157263865, "frac_reward_zero_std": 0.0, "grad_norm": 0.10421536892634764, "kl": 0.0019969940185546875, "learning_rate": 5.212765957446809e-07, "loss": 0.115, "num_tokens": 177553192.0, "reward": 0.6579241156578064, "reward_std": 0.4745548963546753, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2449776828289032, "rewards/tag_count_reward/std": 0.2499494105577469, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1244.046875, "completions/mean_terminated_length": 752.4208984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.052634382824569816, "frac_reward_zero_std": 0.0, "grad_norm": 0.11961361883166327, "kl": 0.002269744873046875, "learning_rate": 5.234042553191489e-07, "loss": 0.15, "num_tokens": 178172957.0, "reward": 0.625558078289032, "reward_std": 0.38029906153678894, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2193080335855484, "rewards/tag_count_reward/std": 0.23656509816646576, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1305.4375, "completions/mean_terminated_length": 811.3159790039062, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.052847477491875766, "frac_reward_zero_std": 0.0, "grad_norm": 0.10489620471694665, "kl": 0.002223968505859375, "learning_rate": 5.25531914893617e-07, "loss": 0.1139, "num_tokens": 178825393.0, "reward": 0.6579241156578064, "reward_std": 0.42199236154556274, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.25822341442108154, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1375.1563720703125, "completions/mean_terminated_length": 875.1050415039062, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.05306057215918172, "frac_reward_zero_std": 0.0, "grad_norm": 0.12310158123222194, "kl": 0.0020694732666015625, "learning_rate": 5.276595744680851e-07, "loss": 0.0738, "num_tokens": 179513991.0, "reward": 0.6166294813156128, "reward_std": 0.44393911957740784, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2349330335855484, "rewards/tag_count_reward/std": 0.24215105175971985, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1395.6585693359375, "completions/mean_terminated_length": 879.0040283203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.05327366682648767, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10143866384812297, "kl": 0.0019588470458984375, "learning_rate": 5.297872340425532e-07, "loss": 0.0952, "num_tokens": 180207038.0, "reward": 0.5993303656578064, "reward_std": 0.3879188299179077, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2109375, "rewards/tag_count_reward/std": 0.2190144956111908, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1411.87060546875, "completions/mean_terminated_length": 960.2671508789062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.05348676149379362, "frac_reward_zero_std": 0.0, "grad_norm": 0.1127356505588024, "kl": 0.002155303955078125, "learning_rate": 5.319148936170212e-07, "loss": 0.0951, "num_tokens": 180912340.0, "reward": 0.5290178656578064, "reward_std": 0.34892863035202026, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.47195106744766235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2075892835855484, "rewards/tag_count_reward/std": 0.22655968368053436, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1337.5804443359375, "completions/mean_terminated_length": 837.8555297851562, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.05369985616109957, "frac_reward_zero_std": 0.0, "grad_norm": 0.10831817968873333, "kl": 0.002429962158203125, "learning_rate": 5.340425531914894e-07, "loss": 0.1064, "num_tokens": 181574008.0, "reward": 0.6004464626312256, "reward_std": 0.36334195733070374, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.234375, "rewards/tag_count_reward/std": 0.24182668328285217, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1428.93310546875, "completions/mean_terminated_length": 968.8482666015625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.05391295082840552, "frac_reward_zero_std": 0.0, "grad_norm": 0.10370107091258478, "kl": 0.0022106170654296875, "learning_rate": 5.361702127659574e-07, "loss": 0.1416, "num_tokens": 182285594.0, "reward": 0.6428571939468384, "reward_std": 0.3974340856075287, "rewards/accuracy_reward/mean": 0.4305555522441864, "rewards/accuracy_reward/std": 0.495728075504303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2276785671710968, "rewards/tag_count_reward/std": 0.24071939289569855, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1354.828125, "completions/mean_terminated_length": 780.4856567382812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05412604549571147, "frac_reward_zero_std": 0.0, "grad_norm": 0.574807841611994, "kl": 0.0021686553955078125, "learning_rate": 5.382978723404255e-07, "loss": 0.1137, "num_tokens": 182964333.0, "reward": 0.6082589626312256, "reward_std": 0.4060520529747009, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2087053507566452, "rewards/tag_count_reward/std": 0.23464499413967133, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1302.5513916015625, "completions/mean_terminated_length": 888.4132080078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05433914016301742, "frac_reward_zero_std": 0.0, "grad_norm": 0.1132689062105518, "kl": 0.0024566650390625, "learning_rate": 5.404255319148936e-07, "loss": 0.1481, "num_tokens": 183617268.0, "reward": 0.6534598469734192, "reward_std": 0.38801801204681396, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2516741156578064, "rewards/tag_count_reward/std": 0.25055304169654846, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1307.274658203125, "completions/mean_terminated_length": 858.5913696289062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.05455223483032337, "frac_reward_zero_std": 0.0, "grad_norm": 0.10981357267287487, "kl": 0.0023651123046875, "learning_rate": 5.425531914893617e-07, "loss": 0.1381, "num_tokens": 184268719.0, "reward": 0.7204241156578064, "reward_std": 0.4415828585624695, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2516741156578064, "rewards/tag_count_reward/std": 0.23737114667892456, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1449.47998046875, "completions/mean_terminated_length": 944.5555419921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05476532949762932, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11774942896434641, "kl": 0.00235748291015625, "learning_rate": 5.446808510638298e-07, "loss": 0.1221, "num_tokens": 184995782.0, "reward": 0.5954241156578064, "reward_std": 0.40222373604774475, "rewards/accuracy_reward/mean": 0.3958333432674408, "rewards/accuracy_reward/std": 0.4895959198474884, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2137276828289032, "rewards/tag_count_reward/std": 0.23695528507232666, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1398.2254638671875, "completions/mean_terminated_length": 874.2136840820312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.05497842416493527, "frac_reward_zero_std": 0.0, "grad_norm": 1.0722756405134082, "kl": 0.00913238525390625, "learning_rate": 5.468085106382978e-07, "loss": 0.103, "num_tokens": 185705531.0, "reward": 0.498325914144516, "reward_std": 0.38193479180336, "rewards/accuracy_reward/mean": 0.2777777910232544, "rewards/accuracy_reward/std": 0.44842249155044556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.23046875, "rewards/tag_count_reward/std": 0.23538535833358765, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1431.274658203125, "completions/mean_terminated_length": 947.2310791015625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.05519151883224122, "frac_reward_zero_std": 0.0, "grad_norm": 0.10516389543313849, "kl": 0.00211334228515625, "learning_rate": 5.48936170212766e-07, "loss": 0.1272, "num_tokens": 186416982.0, "reward": 0.5876116156578064, "reward_std": 0.4079829156398773, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2126116007566452, "rewards/tag_count_reward/std": 0.22215762734413147, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1346.0670166015625, "completions/mean_terminated_length": 920.8817138671875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.05540461349954717, "frac_reward_zero_std": 0.0, "grad_norm": 0.1492683471879114, "kl": 0.0026035308837890625, "learning_rate": 5.51063829787234e-07, "loss": 0.0796, "num_tokens": 187090148.0, "reward": 0.6467634439468384, "reward_std": 0.3375851809978485, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2449776828289032, "rewards/tag_count_reward/std": 0.2384992092847824, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1581.982177734375, "completions/mean_terminated_length": 1029.5804443359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.05561770816685312, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09441529676649732, "kl": 0.0018463134765625, "learning_rate": 5.531914893617021e-07, "loss": 0.0808, "num_tokens": 187870380.0, "reward": 0.5167410969734192, "reward_std": 0.4009559750556946, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.19876663386821747, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1360.243408203125, "completions/mean_terminated_length": 955.3936157226562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.055830802834159074, "frac_reward_zero_std": 0.0, "grad_norm": 0.12006458425020748, "kl": 0.0020923614501953125, "learning_rate": 5.553191489361701e-07, "loss": 0.1594, "num_tokens": 188546233.0, "reward": 0.6651785969734192, "reward_std": 0.4914795458316803, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2388392835855484, "rewards/tag_count_reward/std": 0.2432268112897873, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1402.8951416015625, "completions/mean_terminated_length": 940.6934814453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.056043897501465025, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10158763031076198, "kl": 0.002044677734375, "learning_rate": 5.574468085106383e-07, "loss": 0.1016, "num_tokens": 189247962.0, "reward": 0.6863839626312256, "reward_std": 0.4087267220020294, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.23695333302021027, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1272.4129638671875, "completions/mean_terminated_length": 833.0944213867188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.056256992168770975, "frac_reward_zero_std": 0.0, "grad_norm": 0.11167561631369617, "kl": 0.002597808837890625, "learning_rate": 5.595744680851063e-07, "loss": 0.1561, "num_tokens": 189883235.0, "reward": 0.6328125, "reward_std": 0.4506607949733734, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.23813055455684662, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1477.0535888671875, "completions/mean_terminated_length": 950.214599609375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.056470086836076926, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.09737646782863565, "kl": 0.002094268798828125, "learning_rate": 5.617021276595744e-07, "loss": 0.0892, "num_tokens": 190619819.0, "reward": 0.5580357313156128, "reward_std": 0.3536396324634552, "rewards/accuracy_reward/mean": 0.3325892984867096, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2254464328289032, "rewards/tag_count_reward/std": 0.24681374430656433, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1355.122802734375, "completions/mean_terminated_length": 894.0631713867188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.056683181503382876, "frac_reward_zero_std": 0.0, "grad_norm": 0.11131382201245558, "kl": 0.002262115478515625, "learning_rate": 5.638297872340425e-07, "loss": 0.1134, "num_tokens": 191298946.0, "reward": 0.6830357313156128, "reward_std": 0.42979133129119873, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2522321343421936, "rewards/tag_count_reward/std": 0.24232175946235657, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1306.904052734375, "completions/mean_terminated_length": 799.83837890625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.05689627617068883, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16666765437193687, "kl": 0.002765655517578125, "learning_rate": 5.659574468085107e-07, "loss": 0.1017, "num_tokens": 191952439.0, "reward": 0.6556919813156128, "reward_std": 0.40206488966941833, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2717633843421936, "rewards/tag_count_reward/std": 0.26057201623916626, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1388.9576416015625, "completions/mean_terminated_length": 894.67578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.05710937083799478, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1011795657906066, "kl": 0.00235748291015625, "learning_rate": 5.680851063829787e-07, "loss": 0.1263, "num_tokens": 192640932.0, "reward": 0.6322544813156128, "reward_std": 0.3782602846622467, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2371651828289032, "rewards/tag_count_reward/std": 0.2502289414405823, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1309.852783203125, "completions/mean_terminated_length": 862.731201171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.05732246550530073, "frac_reward_zero_std": 0.0, "grad_norm": 0.14259109791603855, "kl": 0.00262451171875, "learning_rate": 5.702127659574469e-07, "loss": 0.147, "num_tokens": 193294466.0, "reward": 0.6439732313156128, "reward_std": 0.4190162122249603, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2667410671710968, "rewards/tag_count_reward/std": 0.2670345902442932, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1332.7098388671875, "completions/mean_terminated_length": 895.3021850585938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.05753556017260668, "frac_reward_zero_std": 0.0, "grad_norm": 0.15700509761295714, "kl": 0.00263214111328125, "learning_rate": 5.723404255319149e-07, "loss": 0.1043, "num_tokens": 193962272.0, "reward": 0.6707589626312256, "reward_std": 0.40346354246139526, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2600446343421936, "rewards/tag_count_reward/std": 0.252857506275177, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1463.8282470703125, "completions/mean_terminated_length": 924.785400390625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.05774865483991263, "frac_reward_zero_std": 0.0, "grad_norm": 0.09990787000231265, "kl": 0.002315521240234375, "learning_rate": 5.74468085106383e-07, "loss": 0.0854, "num_tokens": 194690915.0, "reward": 0.6473214626312256, "reward_std": 0.4259307086467743, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2321428507566452, "rewards/tag_count_reward/std": 0.2507578730583191, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1163.171875, "completions/mean_terminated_length": 832.0398559570312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05796174950721858, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.3606952081114682, "kl": 0.00356292724609375, "learning_rate": 5.76595744680851e-07, "loss": 0.0752, "num_tokens": 195278576.0, "reward": 0.8822544813156128, "reward_std": 0.4029451310634613, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2862723171710968, "rewards/tag_count_reward/std": 0.2632318139076233, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1444.0491943359375, "completions/mean_terminated_length": 934.5431518554688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.05817484417452453, "frac_reward_zero_std": 0.0, "grad_norm": 0.10636257933505665, "kl": 0.002384185791015625, "learning_rate": 5.787234042553191e-07, "loss": 0.1347, "num_tokens": 195992230.0, "reward": 0.6953125596046448, "reward_std": 0.46234703063964844, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2667410671710968, "rewards/tag_count_reward/std": 0.25906145572662354, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1375.013427734375, "completions/mean_terminated_length": 990.1123046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.05838793884183048, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11160007530533718, "kl": 0.002391815185546875, "learning_rate": 5.808510638297872e-07, "loss": 0.0999, "num_tokens": 196681724.0, "reward": 0.6914063096046448, "reward_std": 0.38176435232162476, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2561383843421936, "rewards/tag_count_reward/std": 0.23788601160049438, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1268.8638916015625, "completions/mean_terminated_length": 914.7110595703125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.05860103350913643, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11314332249872504, "kl": 0.00270843505859375, "learning_rate": 5.829787234042552e-07, "loss": 0.1064, "num_tokens": 197316591.0, "reward": 0.7561384439468384, "reward_std": 0.38534533977508545, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2717633843421936, "rewards/tag_count_reward/std": 0.24508734047412872, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1347.029052734375, "completions/mean_terminated_length": 942.242919921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.05881412817644238, "frac_reward_zero_std": 0.0, "grad_norm": 0.11322320281886235, "kl": 0.00263214111328125, "learning_rate": 5.851063829787234e-07, "loss": 0.1232, "num_tokens": 197984636.0, "reward": 0.8565848469734192, "reward_std": 0.45551785826683044, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2896205484867096, "rewards/tag_count_reward/std": 0.2562822997570038, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1441.3013916015625, "completions/mean_terminated_length": 910.75732421875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.05902722284374833, "frac_reward_zero_std": 0.0, "grad_norm": 0.0981790236901551, "kl": 0.002227783203125, "learning_rate": 5.872340425531914e-07, "loss": 0.1128, "num_tokens": 198705715.0, "reward": 0.60546875, "reward_std": 0.4871624708175659, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2282366007566452, "rewards/tag_count_reward/std": 0.24565714597702026, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1352.680908203125, "completions/mean_terminated_length": 977.54296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.05924031751105428, "frac_reward_zero_std": 0.0, "grad_norm": 0.12201504936080408, "kl": 0.002719879150390625, "learning_rate": 5.893617021276595e-07, "loss": 0.0999, "num_tokens": 199381140.0, "reward": 0.640625, "reward_std": 0.3989824056625366, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2790178656578064, "rewards/tag_count_reward/std": 0.24461889266967773, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1402.9420166015625, "completions/mean_terminated_length": 910.2598266601562, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.05945341217836023, "frac_reward_zero_std": 0.0, "grad_norm": 1.934626448161137, "kl": 0.00521087646484375, "learning_rate": 5.914893617021275e-07, "loss": 0.0746, "num_tokens": 200084890.0, "reward": 0.58984375, "reward_std": 0.3934418261051178, "rewards/accuracy_reward/mean": 0.3147321343421936, "rewards/accuracy_reward/std": 0.4649282991886139, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2751116156578064, "rewards/tag_count_reward/std": 0.27743226289749146, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1368.24560546875, "completions/mean_terminated_length": 885.6717529296875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.059666506845666184, "frac_reward_zero_std": 0.0, "grad_norm": 0.107300934420347, "kl": 0.00250244140625, "learning_rate": 5.936170212765958e-07, "loss": 0.1175, "num_tokens": 200772744.0, "reward": 0.6395089626312256, "reward_std": 0.4106302857398987, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2511160671710968, "rewards/tag_count_reward/std": 0.2680797576904297, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1372.4554443359375, "completions/mean_terminated_length": 837.4240112304688, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.059879601512972135, "frac_reward_zero_std": 0.0, "grad_norm": 0.09809112278554397, "kl": 0.002422332763671875, "learning_rate": 5.957446808510638e-07, "loss": 0.1306, "num_tokens": 201465812.0, "reward": 0.640625, "reward_std": 0.447689950466156, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2745535671710968, "rewards/tag_count_reward/std": 0.27723026275634766, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1347.618408203125, "completions/mean_terminated_length": 907.0145263671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.060092696180278085, "frac_reward_zero_std": 0.0, "grad_norm": 0.12191400328705543, "kl": 0.002895355224609375, "learning_rate": 5.978723404255319e-07, "loss": 0.0979, "num_tokens": 202137801.0, "reward": 0.7020089626312256, "reward_std": 0.42347556352615356, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2912946343421936, "rewards/tag_count_reward/std": 0.2643469274044037, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1412.4107666015625, "completions/mean_terminated_length": 913.561767578125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.06030579084758404, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10329346225576208, "kl": 0.002796173095703125, "learning_rate": 6e-07, "loss": 0.1379, "num_tokens": 202844657.0, "reward": 0.6205357313156128, "reward_std": 0.41759294271469116, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.2800706923007965, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1258.607177734375, "completions/mean_terminated_length": 841.0101928710938, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.06051888551488999, "frac_reward_zero_std": 0.0, "grad_norm": 0.117627392301162, "kl": 0.003108978271484375, "learning_rate": 6.021276595744681e-07, "loss": 0.1359, "num_tokens": 203475953.0, "reward": 0.7706473469734192, "reward_std": 0.4639025330543518, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3197544515132904, "rewards/tag_count_reward/std": 0.2838204801082611, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1422.8148193359375, "completions/mean_terminated_length": 953.92578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.060731980182195944, "frac_reward_zero_std": 0.0, "grad_norm": 0.10814723346279091, "kl": 0.002681732177734375, "learning_rate": 6.042553191489361e-07, "loss": 0.1101, "num_tokens": 204181134.0, "reward": 0.65234375, "reward_std": 0.42916712164878845, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2438616007566452, "rewards/tag_count_reward/std": 0.2651267945766449, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1309.6317138671875, "completions/mean_terminated_length": 870.81494140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.060945074849501894, "frac_reward_zero_std": 0.0, "grad_norm": 0.1961563161108591, "kl": 0.003780364990234375, "learning_rate": 6.063829787234043e-07, "loss": 0.1511, "num_tokens": 204834713.0, "reward": 0.7885044813156128, "reward_std": 0.42053401470184326, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3510044515132904, "rewards/tag_count_reward/std": 0.2876738905906677, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1340.3125, "completions/mean_terminated_length": 851.6075439453125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.061158169516807845, "frac_reward_zero_std": 0.0, "grad_norm": 0.1041817157682442, "kl": 0.003040313720703125, "learning_rate": 6.085106382978723e-07, "loss": 0.13, "num_tokens": 205505381.0, "reward": 0.6992188096046448, "reward_std": 0.42712050676345825, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2795758843421936, "rewards/tag_count_reward/std": 0.2769909203052521, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1413.5379638671875, "completions/mean_terminated_length": 812.17822265625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.061371264184113795, "frac_reward_zero_std": 0.0, "grad_norm": 0.10002502646311803, "kl": 0.002445220947265625, "learning_rate": 6.106382978723404e-07, "loss": 0.1126, "num_tokens": 206204630.0, "reward": 0.5719866156578064, "reward_std": 0.41618457436561584, "rewards/accuracy_reward/mean": 0.3102678656578064, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.26171875, "rewards/tag_count_reward/std": 0.28131988644599915, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1336.419677734375, "completions/mean_terminated_length": 880.2783813476562, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.061584358851419746, "frac_reward_zero_std": 0.0, "grad_norm": 0.11438909649491394, "kl": 0.00281524658203125, "learning_rate": 6.127659574468084e-07, "loss": 0.1182, "num_tokens": 206866866.0, "reward": 0.6830357313156128, "reward_std": 0.46057838201522827, "rewards/accuracy_reward/mean": 0.4027777910232544, "rewards/accuracy_reward/std": 0.4910254180431366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2946428656578064, "rewards/tag_count_reward/std": 0.27874815464019775, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1465.3817138671875, "completions/mean_terminated_length": 903.2061767578125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0617974535187257, "frac_reward_zero_std": 0.0, "grad_norm": 0.10553625637377327, "kl": 0.00267791748046875, "learning_rate": 6.148936170212766e-07, "loss": 0.1264, "num_tokens": 207586205.0, "reward": 0.6049107313156128, "reward_std": 0.4599440097808838, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2700892984867096, "rewards/tag_count_reward/std": 0.27859586477279663, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1385.8751220703125, "completions/mean_terminated_length": 915.8167724609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.06201054818603165, "frac_reward_zero_std": 0.0, "grad_norm": 0.10441304913743134, "kl": 0.002758026123046875, "learning_rate": 6.170212765957446e-07, "loss": 0.158, "num_tokens": 208275893.0, "reward": 0.6283482313156128, "reward_std": 0.4457703232765198, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2756696343421936, "rewards/tag_count_reward/std": 0.2806384563446045, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1284.3348388671875, "completions/mean_terminated_length": 752.0833740234375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0622236428533376, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.16842743455439116, "kl": 0.003795623779296875, "learning_rate": 6.191489361702127e-07, "loss": 0.1495, "num_tokens": 208921803.0, "reward": 0.656808078289032, "reward_std": 0.39747610688209534, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3041294515132904, "rewards/tag_count_reward/std": 0.2949039936065674, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1291.9285888671875, "completions/mean_terminated_length": 825.18408203125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06243673752064355, "frac_reward_zero_std": 0.0, "grad_norm": 0.11133438945561518, "kl": 0.003185272216796875, "learning_rate": 6.212765957446809e-07, "loss": 0.1226, "num_tokens": 209570507.0, "reward": 0.7243303656578064, "reward_std": 0.3983565866947174, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3091517984867096, "rewards/tag_count_reward/std": 0.29701149463653564, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1295.1585693359375, "completions/mean_terminated_length": 892.9554443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06264983218794949, "frac_reward_zero_std": 0.0, "grad_norm": 0.1263571544603017, "kl": 0.003376007080078125, "learning_rate": 6.234042553191489e-07, "loss": 0.1032, "num_tokens": 210227506.0, "reward": 0.7784598469734192, "reward_std": 0.5312626361846924, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.30078125, "rewards/tag_count_reward/std": 0.28783005475997925, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1268.5491943359375, "completions/mean_terminated_length": 880.1270751953125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06286292685525545, "frac_reward_zero_std": 0.0, "grad_norm": 0.10021198258450638, "kl": 0.0032501220703125, "learning_rate": 6.25531914893617e-07, "loss": 0.1431, "num_tokens": 210859768.0, "reward": 0.8158482313156128, "reward_std": 0.47183746099472046, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3024553656578064, "rewards/tag_count_reward/std": 0.2743399441242218, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1364.243408203125, "completions/mean_terminated_length": 905.0037231445312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.06307602152256139, "frac_reward_zero_std": 0.0, "grad_norm": 0.10904644072605166, "kl": 0.003192901611328125, "learning_rate": 6.276595744680851e-07, "loss": 0.1306, "num_tokens": 211544629.0, "reward": 0.6785714626312256, "reward_std": 0.38587620854377747, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2991071343421936, "rewards/tag_count_reward/std": 0.27291879057884216, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1412.4598388671875, "completions/mean_terminated_length": 913.6494140625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.06328911618986735, "frac_reward_zero_std": 0.0, "grad_norm": 0.1114235807426125, "kl": 0.0030364990234375, "learning_rate": 6.297872340425532e-07, "loss": 0.1057, "num_tokens": 212249091.0, "reward": 0.566964328289032, "reward_std": 0.41129270195961, "rewards/accuracy_reward/mean": 0.3058035671710968, "rewards/accuracy_reward/std": 0.4612620174884796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2611607015132904, "rewards/tag_count_reward/std": 0.28553569316864014, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1212.2857666015625, "completions/mean_terminated_length": 851.8338623046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0635022108571733, "frac_reward_zero_std": 0.0, "grad_norm": 0.12355584922860169, "kl": 0.004459381103515625, "learning_rate": 6.319148936170212e-07, "loss": 0.0972, "num_tokens": 212855411.0, "reward": 0.848214328289032, "reward_std": 0.46111035346984863, "rewards/accuracy_reward/mean": 0.48842594027519226, "rewards/accuracy_reward/std": 0.500445544719696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3772321343421936, "rewards/tag_count_reward/std": 0.3014608919620514, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1363.04248046875, "completions/mean_terminated_length": 898.7078857421875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06371530552447925, "frac_reward_zero_std": 0.0, "grad_norm": 1.6525170299487657, "kl": 0.00958251953125, "learning_rate": 6.340425531914893e-07, "loss": 0.1403, "num_tokens": 213544838.0, "reward": 0.6640625, "reward_std": 0.43487489223480225, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3046875, "rewards/tag_count_reward/std": 0.28294217586517334, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1379.247802734375, "completions/mean_terminated_length": 1018.4432983398438, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.0639284001917852, "frac_reward_zero_std": 0.0, "grad_norm": 0.10703874401396833, "kl": 0.0035552978515625, "learning_rate": 6.361702127659574e-07, "loss": 0.1367, "num_tokens": 214227429.0, "reward": 0.8532366156578064, "reward_std": 0.49103841185569763, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3577008843421936, "rewards/tag_count_reward/std": 0.2929672598838806, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 1410.7344970703125, "completions/mean_terminated_length": 887.451171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.06414149485909115, "frac_reward_zero_std": 0.0, "grad_norm": 0.1057458392485803, "kl": 0.003360748291015625, "learning_rate": 6.382978723404255e-07, "loss": 0.0901, "num_tokens": 214935198.0, "reward": 0.5853794813156128, "reward_std": 0.3734409213066101, "rewards/accuracy_reward/mean": 0.2991071343421936, "rewards/accuracy_reward/std": 0.45837870240211487, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2862723171710968, "rewards/tag_count_reward/std": 0.28905490040779114, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1351.77685546875, "completions/mean_terminated_length": 913.7891235351562, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0643545895263971, "frac_reward_zero_std": 0.0, "grad_norm": 0.11870056565425288, "kl": 0.004306793212890625, "learning_rate": 6.404255319148935e-07, "loss": 0.1716, "num_tokens": 215610442.0, "reward": 0.7053571939468384, "reward_std": 0.38001716136932373, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3370535671710968, "rewards/tag_count_reward/std": 0.30508336424827576, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1357.837158203125, "completions/mean_terminated_length": 835.4784545898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.06456768419370305, "frac_reward_zero_std": 0.0, "grad_norm": 0.1129373616142468, "kl": 0.0040740966796875, "learning_rate": 6.425531914893617e-07, "loss": 0.1299, "num_tokens": 216290385.0, "reward": 0.7142857313156128, "reward_std": 0.4183560907840729, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3191964328289032, "rewards/tag_count_reward/std": 0.3037053346633911, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1275.05810546875, "completions/mean_terminated_length": 820.0637817382812, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.064780778861009, "frac_reward_zero_std": 0.0, "grad_norm": 0.11391010831281453, "kl": 0.003643035888671875, "learning_rate": 6.446808510638297e-07, "loss": 0.121, "num_tokens": 216933899.0, "reward": 0.7042410969734192, "reward_std": 0.47441548109054565, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.29821956157684326, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1204.015625, "completions/mean_terminated_length": 820.3863525390625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.06499387352831495, "frac_reward_zero_std": 0.0, "grad_norm": 0.12531485754865115, "kl": 0.004322052001953125, "learning_rate": 6.468085106382979e-07, "loss": 0.143, "num_tokens": 217539154.0, "reward": 0.8158482313156128, "reward_std": 0.465718150138855, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.29206088185310364, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1302.4910888671875, "completions/mean_terminated_length": 829.065673828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0652069681956209, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11016940336135449, "kl": 0.003643035888671875, "learning_rate": 6.48936170212766e-07, "loss": 0.1791, "num_tokens": 218187134.0, "reward": 0.785714328289032, "reward_std": 0.46417149901390076, "rewards/accuracy_reward/mean": 0.4652777910232544, "rewards/accuracy_reward/std": 0.499371200799942, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3370535671710968, "rewards/tag_count_reward/std": 0.30324462056159973, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1309.009033203125, "completions/mean_terminated_length": 852.8086547851562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06542006286292686, "frac_reward_zero_std": 0.0, "grad_norm": 0.11202130060878548, "kl": 0.0044708251953125, "learning_rate": 6.510638297872341e-07, "loss": 0.1238, "num_tokens": 218844226.0, "reward": 0.6718750596046448, "reward_std": 0.4207690954208374, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3258928656578064, "rewards/tag_count_reward/std": 0.3039436638355255, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1346.743408203125, "completions/mean_terminated_length": 888.7269287109375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0656331575302328, "frac_reward_zero_std": 0.0, "grad_norm": 0.11314957326779536, "kl": 0.0044097900390625, "learning_rate": 6.531914893617021e-07, "loss": 0.1227, "num_tokens": 219511519.0, "reward": 0.8426339626312256, "reward_std": 0.4310063421726227, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3694196343421936, "rewards/tag_count_reward/std": 0.3046472668647766, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1244.0201416015625, "completions/mean_terminated_length": 851.3787231445312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06584625219753876, "frac_reward_zero_std": 0.0, "grad_norm": 0.13730477351699344, "kl": 0.00449371337890625, "learning_rate": 6.553191489361701e-07, "loss": 0.1094, "num_tokens": 220135016.0, "reward": 0.7511160969734192, "reward_std": 0.4742560386657715, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.30608630180358887, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1184.107177734375, "completions/mean_terminated_length": 853.4815063476562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0660593468648447, "frac_reward_zero_std": 0.0, "grad_norm": 0.10185161511489055, "kl": 0.004428863525390625, "learning_rate": 6.574468085106383e-07, "loss": 0.1334, "num_tokens": 220732424.0, "reward": 0.7907366156578064, "reward_std": 0.42039892077445984, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.3076702952384949, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1161.015625, "completions/mean_terminated_length": 778.450439453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.06627244153215066, "frac_reward_zero_std": 0.0, "grad_norm": 0.1387803492721846, "kl": 0.0057373046875, "learning_rate": 6.595744680851063e-07, "loss": 0.1336, "num_tokens": 221323167.0, "reward": 0.8320313096046448, "reward_std": 0.4504760205745697, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3610491156578064, "rewards/tag_count_reward/std": 0.30803120136260986, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1379.0491943359375, "completions/mean_terminated_length": 872.7451782226562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0664855361994566, "frac_reward_zero_std": 0.0, "grad_norm": 0.18000329121600528, "kl": 0.003940582275390625, "learning_rate": 6.617021276595744e-07, "loss": 0.1474, "num_tokens": 222012949.0, "reward": 0.7159598469734192, "reward_std": 0.44996094703674316, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3364955484867096, "rewards/tag_count_reward/std": 0.3036349415779114, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1310.2835693359375, "completions/mean_terminated_length": 771.9497680664062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06669863086676256, "frac_reward_zero_std": 0.0, "grad_norm": 0.11125159813996138, "kl": 0.004058837890625, "learning_rate": 6.638297872340425e-07, "loss": 0.1663, "num_tokens": 222672740.0, "reward": 0.7589285969734192, "reward_std": 0.44345101714134216, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3459821343421936, "rewards/tag_count_reward/std": 0.31149765849113464, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1458.26123046875, "completions/mean_terminated_length": 928.4957885742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.06691172553406852, "frac_reward_zero_std": 0.0, "grad_norm": 0.10535689333378054, "kl": 0.00356292724609375, "learning_rate": 6.659574468085106e-07, "loss": 0.1258, "num_tokens": 223403561.0, "reward": 0.7377232313156128, "reward_std": 0.48062777519226074, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3247767984867096, "rewards/tag_count_reward/std": 0.29345956444740295, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1304.10498046875, "completions/mean_terminated_length": 836.1272583007812, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.06712482020137446, "frac_reward_zero_std": 0.0, "grad_norm": 0.12201852959040368, "kl": 0.004032135009765625, "learning_rate": 6.680851063829786e-07, "loss": 0.1186, "num_tokens": 224066168.0, "reward": 0.7672991156578064, "reward_std": 0.4223143756389618, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3409598171710968, "rewards/tag_count_reward/std": 0.3050869405269623, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1344.8170166015625, "completions/mean_terminated_length": 863.6917724609375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.06733791486868042, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11422033281040551, "kl": 0.00428009033203125, "learning_rate": 6.702127659574468e-07, "loss": 0.0915, "num_tokens": 224731766.0, "reward": 0.6986607313156128, "reward_std": 0.4488019347190857, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399619221687317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.3190997540950775, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1215.1741943359375, "completions/mean_terminated_length": 852.1474609375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.06755100953598636, "frac_reward_zero_std": 0.0, "grad_norm": 0.1413236481182424, "kl": 0.005096435546875, "learning_rate": 6.723404255319149e-07, "loss": 0.1195, "num_tokens": 225343348.0, "reward": 0.8035714626312256, "reward_std": 0.4650380611419678, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3683035671710968, "rewards/tag_count_reward/std": 0.2881123721599579, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1364.2723388671875, "completions/mean_terminated_length": 954.0357055664062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06776410420329232, "frac_reward_zero_std": 0.0, "grad_norm": 0.3059060722790863, "kl": 0.00421905517578125, "learning_rate": 6.74468085106383e-07, "loss": 0.1343, "num_tokens": 226032782.0, "reward": 0.7064732313156128, "reward_std": 0.4493306875228882, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3604910671710968, "rewards/tag_count_reward/std": 0.2959672808647156, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1375.8817138671875, "completions/mean_terminated_length": 876.36962890625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.06797719887059826, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1126971288726166, "kl": 0.004131317138671875, "learning_rate": 6.76595744680851e-07, "loss": 0.1509, "num_tokens": 226724473.0, "reward": 0.754464328289032, "reward_std": 0.4341088831424713, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3169642984867096, "rewards/tag_count_reward/std": 0.30328577756881714, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1377.9688720703125, "completions/mean_terminated_length": 897.9080200195312, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.06819029353790422, "frac_reward_zero_std": 0.0, "grad_norm": 0.1047019472189945, "kl": 0.003772735595703125, "learning_rate": 6.787234042553192e-07, "loss": 0.1354, "num_tokens": 227418587.0, "reward": 0.6869419813156128, "reward_std": 0.43988627195358276, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.3067030608654022, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1506.3973388671875, "completions/mean_terminated_length": 940.0639038085938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.06840338820521016, "frac_reward_zero_std": 0.0, "grad_norm": 0.09759000097938918, "kl": 0.0033111572265625, "learning_rate": 6.808510638297872e-07, "loss": 0.1158, "num_tokens": 228164109.0, "reward": 0.5524553656578064, "reward_std": 0.4266922175884247, "rewards/accuracy_reward/mean": 0.2566964328289032, "rewards/accuracy_reward/std": 0.4372987747192383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2957589328289032, "rewards/tag_count_reward/std": 0.30857232213020325, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1134.0670166015625, "completions/mean_terminated_length": 731.4662475585938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.06861648287251612, "frac_reward_zero_std": 0.0, "grad_norm": 0.160229197618124, "kl": 0.004913330078125, "learning_rate": 6.829787234042553e-07, "loss": 0.1532, "num_tokens": 228737675.0, "reward": 0.8476563096046448, "reward_std": 0.4259325861930847, "rewards/accuracy_reward/mean": 0.44907405972480774, "rewards/accuracy_reward/std": 0.49797651171684265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4146205484867096, "rewards/tag_count_reward/std": 0.30624276399612427, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1250.727783203125, "completions/mean_terminated_length": 794.743896484375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.06882957753982206, "frac_reward_zero_std": 0.0, "grad_norm": 0.11077532234015854, "kl": 0.0046539306640625, "learning_rate": 6.851063829787234e-07, "loss": 0.124, "num_tokens": 229365537.0, "reward": 0.8275669813156128, "reward_std": 0.4649622440338135, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3699776828289032, "rewards/tag_count_reward/std": 0.3176877200603485, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1336.515625, "completions/mean_terminated_length": 876.1433715820312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.06904267220712802, "frac_reward_zero_std": 0.0, "grad_norm": 0.11537342606505271, "kl": 0.004520416259765625, "learning_rate": 6.872340425531915e-07, "loss": 0.1765, "num_tokens": 230043624.0, "reward": 0.7064732313156128, "reward_std": 0.43799006938934326, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3582589328289032, "rewards/tag_count_reward/std": 0.31150367856025696, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1431.4129638671875, "completions/mean_terminated_length": 964.7412109375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.06925576687443397, "frac_reward_zero_std": 0.0, "grad_norm": 0.13168027684807662, "kl": 0.00611114501953125, "learning_rate": 6.893617021276595e-07, "loss": 0.129, "num_tokens": 230755073.0, "reward": 0.7539063096046448, "reward_std": 0.43856027722358704, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3454241156578064, "rewards/tag_count_reward/std": 0.31009531021118164, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1439.8170166015625, "completions/mean_terminated_length": 962.4780883789062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06946886154173992, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10476827077645101, "kl": 0.0042266845703125, "learning_rate": 6.914893617021277e-07, "loss": 0.0941, "num_tokens": 231470927.0, "reward": 0.7511160969734192, "reward_std": 0.4309828281402588, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3225446343421936, "rewards/tag_count_reward/std": 0.31513744592666626, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1035.122802734375, "completions/mean_terminated_length": 847.5528564453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.06968195620904587, "frac_reward_zero_std": 0.0, "grad_norm": 0.12200688034336493, "kl": 0.00629425048828125, "learning_rate": 6.936170212765957e-07, "loss": 0.0707, "num_tokens": 231996902.0, "reward": 1.109375, "reward_std": 0.4636426270008087, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4620535671710968, "rewards/tag_count_reward/std": 0.2890813648700714, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1353.83935546875, "completions/mean_terminated_length": 896.2073974609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06989505087635182, "frac_reward_zero_std": 0.0, "grad_norm": 0.11730913789115854, "kl": 0.00429534912109375, "learning_rate": 6.957446808510637e-07, "loss": 0.1252, "num_tokens": 232674766.0, "reward": 0.7578125596046448, "reward_std": 0.48383283615112305, "rewards/accuracy_reward/mean": 0.39814814925193787, "rewards/accuracy_reward/std": 0.49008384346961975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3738839328289032, "rewards/tag_count_reward/std": 0.28964006900787354, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1484.46435546875, "completions/mean_terminated_length": 978.2373046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07010814554365777, "frac_reward_zero_std": 0.0, "grad_norm": 0.1083797455369247, "kl": 0.00372314453125, "learning_rate": 6.978723404255319e-07, "loss": 0.1588, "num_tokens": 233411614.0, "reward": 0.617745578289032, "reward_std": 0.4261285066604614, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2829241156578064, "rewards/tag_count_reward/std": 0.29614174365997314, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1264.72998046875, "completions/mean_terminated_length": 882.20263671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07032124021096373, "frac_reward_zero_std": 0.0, "grad_norm": 0.11064147007348499, "kl": 0.004802703857421875, "learning_rate": 7e-07, "loss": 0.1706, "num_tokens": 234046517.0, "reward": 0.8766741752624512, "reward_std": 0.4159274399280548, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3900669515132904, "rewards/tag_count_reward/std": 0.3084321916103363, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1418.33935546875, "completions/mean_terminated_length": 1014.7106323242188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.07053433487826967, "frac_reward_zero_std": 0.0, "grad_norm": 0.10604201192818602, "kl": 0.0043182373046875, "learning_rate": 7.021276595744681e-07, "loss": 0.099, "num_tokens": 234755725.0, "reward": 0.805245578289032, "reward_std": 0.4905674457550049, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3699776828289032, "rewards/tag_count_reward/std": 0.31724730134010315, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1340.950927734375, "completions/mean_terminated_length": 985.0537109375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.07074742954557563, "frac_reward_zero_std": 0.0, "grad_norm": 0.11485927129809907, "kl": 0.00485992431640625, "learning_rate": 7.042553191489361e-07, "loss": 0.1018, "num_tokens": 235428631.0, "reward": 0.8671875596046448, "reward_std": 0.4256546199321747, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3627232015132904, "rewards/tag_count_reward/std": 0.2884133756160736, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1325.109375, "completions/mean_terminated_length": 878.848388671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.07096052421288157, "frac_reward_zero_std": 0.0, "grad_norm": 0.11310105761655365, "kl": 0.004547119140625, "learning_rate": 7.063829787234043e-07, "loss": 0.1112, "num_tokens": 236092808.0, "reward": 0.8392857313156128, "reward_std": 0.4639196991920471, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3973214328289032, "rewards/tag_count_reward/std": 0.30616989731788635, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1354.6429443359375, "completions/mean_terminated_length": 880.2406005859375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.07117361888018753, "frac_reward_zero_std": 0.0, "grad_norm": 0.1046794921701192, "kl": 0.0046844482421875, "learning_rate": 7.085106382978723e-07, "loss": 0.1118, "num_tokens": 236762296.0, "reward": 0.8900669813156128, "reward_std": 0.4585322141647339, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3722098171710968, "rewards/tag_count_reward/std": 0.31372949481010437, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1209.38623046875, "completions/mean_terminated_length": 816.2000122070312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07138671354749347, "frac_reward_zero_std": 0.0, "grad_norm": 6.343263602137767, "kl": 0.1538238525390625, "learning_rate": 7.106382978723404e-07, "loss": 0.134, "num_tokens": 237373509.0, "reward": 0.9453125596046448, "reward_std": 0.4081028401851654, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4229910671710968, "rewards/tag_count_reward/std": 0.3100414574146271, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1225.453125, "completions/mean_terminated_length": 859.287109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.07159980821479943, "frac_reward_zero_std": 0.0, "grad_norm": 0.1169139833377291, "kl": 0.0057525634765625, "learning_rate": 7.127659574468084e-07, "loss": 0.128, "num_tokens": 237993472.0, "reward": 0.9508929252624512, "reward_std": 0.462897926568985, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4151785671710968, "rewards/tag_count_reward/std": 0.30753690004348755, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1325.4375, "completions/mean_terminated_length": 883.582763671875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07181290288210537, "frac_reward_zero_std": 0.0, "grad_norm": 0.30890027858823593, "kl": 0.00490570068359375, "learning_rate": 7.148936170212766e-07, "loss": 0.1597, "num_tokens": 238660212.0, "reward": 0.7578125596046448, "reward_std": 0.4775422215461731, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3761160671710968, "rewards/tag_count_reward/std": 0.32854312658309937, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1347.7232666015625, "completions/mean_terminated_length": 868.5864868164062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.07202599754941133, "frac_reward_zero_std": 0.0, "grad_norm": 0.10666511953622149, "kl": 0.00457000732421875, "learning_rate": 7.170212765957446e-07, "loss": 0.172, "num_tokens": 239334696.0, "reward": 0.7382813096046448, "reward_std": 0.45309746265411377, "rewards/accuracy_reward/mean": 0.3888888955116272, "rewards/accuracy_reward/std": 0.4880632162094116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.3144131898880005, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1290.578125, "completions/mean_terminated_length": 942.706787109375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.07223909221671727, "frac_reward_zero_std": 0.0, "grad_norm": 0.12307020779859715, "kl": 0.0059967041015625, "learning_rate": 7.191489361702127e-07, "loss": 0.087, "num_tokens": 239988731.0, "reward": 0.9162946939468384, "reward_std": 0.4513002038002014, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4229910671710968, "rewards/tag_count_reward/std": 0.31318238377571106, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1069.9910888671875, "completions/mean_terminated_length": 755.5280151367188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.07245218688402323, "frac_reward_zero_std": 0.0, "grad_norm": 0.1365874482826662, "kl": 0.00689697265625, "learning_rate": 7.212765957446808e-07, "loss": 0.1578, "num_tokens": 240532103.0, "reward": 0.957589328289032, "reward_std": 0.4438931941986084, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4642857015132904, "rewards/tag_count_reward/std": 0.30305516719818115, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1140.0179443359375, "completions/mean_terminated_length": 764.7949829101562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07266528155132917, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11663750926784484, "kl": 0.0072021484375, "learning_rate": 7.23404255319149e-07, "loss": 0.0926, "num_tokens": 241106927.0, "reward": 0.9190848469734192, "reward_std": 0.3914845883846283, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4391741156578064, "rewards/tag_count_reward/std": 0.32864710688591003, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1244.1273193359375, "completions/mean_terminated_length": 867.2294921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.07287837621863513, "frac_reward_zero_std": 0.0, "grad_norm": 0.37253260087848583, "kl": 0.017974853515625, "learning_rate": 7.25531914893617e-07, "loss": 0.1367, "num_tokens": 241739512.0, "reward": 0.9285714626312256, "reward_std": 0.45305758714675903, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4464285671710968, "rewards/tag_count_reward/std": 0.32110437750816345, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1344.7254638671875, "completions/mean_terminated_length": 934.6890258789062, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.07309147088594108, "frac_reward_zero_std": 0.0, "grad_norm": 0.10728719073805787, "kl": 0.005767822265625, "learning_rate": 7.276595744680852e-07, "loss": 0.145, "num_tokens": 242417309.0, "reward": 0.9626116752624512, "reward_std": 0.5431643724441528, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4402901828289032, "rewards/tag_count_reward/std": 0.3219774663448334, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1311.9285888671875, "completions/mean_terminated_length": 812.94384765625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.07330456555324703, "frac_reward_zero_std": 0.0, "grad_norm": 0.12286170755126098, "kl": 0.00467681884765625, "learning_rate": 7.297872340425532e-07, "loss": 0.1576, "num_tokens": 243083789.0, "reward": 0.727120578289032, "reward_std": 0.5145233273506165, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3722098171710968, "rewards/tag_count_reward/std": 0.3150636851787567, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1346.9129638671875, "completions/mean_terminated_length": 942.059814453125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.07351766022055298, "frac_reward_zero_std": 0.0, "grad_norm": 0.1545889246069127, "kl": 0.0072784423828125, "learning_rate": 7.319148936170212e-07, "loss": 0.1314, "num_tokens": 243755302.0, "reward": 0.8649554252624512, "reward_std": 0.4807323217391968, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4073660671710968, "rewards/tag_count_reward/std": 0.3320356011390686, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1288.618408203125, "completions/mean_terminated_length": 858.47900390625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.07373075488785893, "frac_reward_zero_std": 0.0, "grad_norm": 0.11171664756426589, "kl": 0.00605010986328125, "learning_rate": 7.340425531914893e-07, "loss": 0.1661, "num_tokens": 244403915.0, "reward": 0.9246652126312256, "reward_std": 0.47537100315093994, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43359375, "rewards/tag_count_reward/std": 0.32283321022987366, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1325.337158203125, "completions/mean_terminated_length": 879.2166137695312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07394384955516488, "frac_reward_zero_std": 0.0, "grad_norm": 0.10749304991399612, "kl": 0.00551605224609375, "learning_rate": 7.361702127659574e-07, "loss": 0.1634, "num_tokens": 245063954.0, "reward": 0.7678571939468384, "reward_std": 0.5014542937278748, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4129464328289032, "rewards/tag_count_reward/std": 0.3314428925514221, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1174.3616943359375, "completions/mean_terminated_length": 843.7230834960938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.07415694422247084, "frac_reward_zero_std": 0.0, "grad_norm": 0.12533643317460383, "kl": 0.0067596435546875, "learning_rate": 7.382978723404255e-07, "loss": 0.1334, "num_tokens": 245655924.0, "reward": 1.0150669813156128, "reward_std": 0.5282325148582458, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4614955484867096, "rewards/tag_count_reward/std": 0.3182491958141327, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1274.591552734375, "completions/mean_terminated_length": 861.4006958007812, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.07437003888977678, "frac_reward_zero_std": 0.0, "grad_norm": 0.1113577187464603, "kl": 0.00650787353515625, "learning_rate": 7.404255319148935e-07, "loss": 0.1266, "num_tokens": 246292941.0, "reward": 0.9514509439468384, "reward_std": 0.4593999981880188, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4291294515132904, "rewards/tag_count_reward/std": 0.321010559797287, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1351.509033203125, "completions/mean_terminated_length": 933.6143188476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.07458313355708274, "frac_reward_zero_std": 0.0, "grad_norm": 0.35591362041197494, "kl": 0.00732421875, "learning_rate": 7.425531914893617e-07, "loss": 0.1225, "num_tokens": 246978577.0, "reward": 0.8900669813156128, "reward_std": 0.5231499075889587, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3967633843421936, "rewards/tag_count_reward/std": 0.32005247473716736, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1220.93310546875, "completions/mean_terminated_length": 821.0927124023438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07479622822438868, "frac_reward_zero_std": 0.0, "grad_norm": 0.11183090081876049, "kl": 0.00676727294921875, "learning_rate": 7.446808510638297e-07, "loss": 0.1427, "num_tokens": 247589667.0, "reward": 0.9542410969734192, "reward_std": 0.49563080072402954, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4520089328289032, "rewards/tag_count_reward/std": 0.3263011574745178, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1262.9754638671875, "completions/mean_terminated_length": 898.6830444335938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07500932289169464, "frac_reward_zero_std": 0.0, "grad_norm": 0.11799169821100412, "kl": 0.00638580322265625, "learning_rate": 7.468085106382978e-07, "loss": 0.0927, "num_tokens": 248224008.0, "reward": 0.9508929252624512, "reward_std": 0.43274056911468506, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4620535671710968, "rewards/tag_count_reward/std": 0.3254833221435547, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1456.3326416015625, "completions/mean_terminated_length": 910.3734130859375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07522241755900058, "frac_reward_zero_std": 0.0, "grad_norm": 0.1000269961608101, "kl": 0.00495147705078125, "learning_rate": 7.489361702127658e-07, "loss": 0.1172, "num_tokens": 248945453.0, "reward": 0.7723214626312256, "reward_std": 0.4526480436325073, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3794642984867096, "rewards/tag_count_reward/std": 0.33483800292015076, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1194.634033203125, "completions/mean_terminated_length": 806.740234375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.07543551222630654, "frac_reward_zero_std": 0.0, "grad_norm": 0.15690249638555528, "kl": 0.0087890625, "learning_rate": 7.510638297872341e-07, "loss": 0.1279, "num_tokens": 249546825.0, "reward": 0.934151828289032, "reward_std": 0.4254299998283386, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4497767984867096, "rewards/tag_count_reward/std": 0.32033950090408325, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1164.5223388671875, "completions/mean_terminated_length": 852.2356567382812, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.07564860689361248, "frac_reward_zero_std": 0.0, "grad_norm": 0.1230482488458399, "kl": 0.0076751708984375, "learning_rate": 7.531914893617021e-07, "loss": 0.0735, "num_tokens": 250134483.0, "reward": 1.0412946939468384, "reward_std": 0.45647132396698, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4832589328289032, "rewards/tag_count_reward/std": 0.31373992562294006, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1340.3148193359375, "completions/mean_terminated_length": 819.1511840820312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07586170156091844, "frac_reward_zero_std": 0.0, "grad_norm": 0.10779877398978702, "kl": 0.00525665283203125, "learning_rate": 7.553191489361702e-07, "loss": 0.1419, "num_tokens": 250809264.0, "reward": 0.7650669813156128, "reward_std": 0.3833584189414978, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3833705484867096, "rewards/tag_count_reward/std": 0.3368448317050934, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1298.4598388671875, "completions/mean_terminated_length": 943.4144897460938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.07607479622822438, "frac_reward_zero_std": 0.0, "grad_norm": 0.11427046399496257, "kl": 0.00658416748046875, "learning_rate": 7.574468085106383e-07, "loss": 0.1082, "num_tokens": 251466590.0, "reward": 0.9587054252624512, "reward_std": 0.47318387031555176, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4475446343421936, "rewards/tag_count_reward/std": 0.3328317105770111, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1349.32373046875, "completions/mean_terminated_length": 866.841552734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.07628789089553034, "frac_reward_zero_std": 0.0, "grad_norm": 0.134753286835733, "kl": 0.00582122802734375, "learning_rate": 7.595744680851064e-07, "loss": 0.154, "num_tokens": 252140655.0, "reward": 0.832589328289032, "reward_std": 0.5002752542495728, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4263392984867096, "rewards/tag_count_reward/std": 0.32579004764556885, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1452.9910888671875, "completions/mean_terminated_length": 1014.8062133789062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07650098556283629, "frac_reward_zero_std": 0.0, "grad_norm": 0.11440168843888172, "kl": 0.00560760498046875, "learning_rate": 7.617021276595744e-07, "loss": 0.1012, "num_tokens": 252859771.0, "reward": 0.7996652126312256, "reward_std": 0.5459848642349243, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4246651828289032, "rewards/tag_count_reward/std": 0.31911492347717285, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1300.0223388671875, "completions/mean_terminated_length": 868.091552734375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.07671408023014224, "frac_reward_zero_std": 0.0, "grad_norm": 2.7055710484810085, "kl": 0.05333709716796875, "learning_rate": 7.638297872340426e-07, "loss": 0.0895, "num_tokens": 253506245.0, "reward": 0.8722098469734192, "reward_std": 0.4105775058269501, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4592633843421936, "rewards/tag_count_reward/std": 0.3240683078765869, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1335.087158203125, "completions/mean_terminated_length": 869.4575805664062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.07692717489744819, "frac_reward_zero_std": 0.0, "grad_norm": 0.4811192671746188, "kl": 0.0089111328125, "learning_rate": 7.659574468085106e-07, "loss": 0.1555, "num_tokens": 254170508.0, "reward": 0.7656250596046448, "reward_std": 0.4630606770515442, "rewards/accuracy_reward/mean": 0.3680555522441864, "rewards/accuracy_reward/std": 0.48283568024635315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4107142984867096, "rewards/tag_count_reward/std": 0.33042433857917786, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1341.12060546875, "completions/mean_terminated_length": 834.6589965820312, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.07714026956475414, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.7916810416441262, "kl": 0.02782440185546875, "learning_rate": 7.680851063829787e-07, "loss": 0.1184, "num_tokens": 254844706.0, "reward": 0.8225446939468384, "reward_std": 0.5005802512168884, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4185267984867096, "rewards/tag_count_reward/std": 0.3337007761001587, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1180.279052734375, "completions/mean_terminated_length": 855.549072265625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07735336423206009, "frac_reward_zero_std": 0.0, "grad_norm": 0.1488906691991433, "kl": 0.00792694091796875, "learning_rate": 7.702127659574467e-07, "loss": 0.145, "num_tokens": 255438815.0, "reward": 1.07421875, "reward_std": 0.4518907070159912, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.32445329427719116, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1201.8170166015625, "completions/mean_terminated_length": 844.5396728515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.07756645889936604, "frac_reward_zero_std": 0.0, "grad_norm": 0.11845335589646902, "kl": 0.00675201416015625, "learning_rate": 7.723404255319148e-07, "loss": 0.1603, "num_tokens": 256049661.0, "reward": 0.9363839626312256, "reward_std": 0.4685453772544861, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4654017984867096, "rewards/tag_count_reward/std": 0.31671810150146484, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1273.9129638671875, "completions/mean_terminated_length": 839.6690063476562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.07777955356667199, "frac_reward_zero_std": 0.0, "grad_norm": 0.12278831565279281, "kl": 0.006683349609375, "learning_rate": 7.744680851063829e-07, "loss": 0.1383, "num_tokens": 256690502.0, "reward": 0.902901828289032, "reward_std": 0.47695741057395935, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4631696343421936, "rewards/tag_count_reward/std": 0.3403094410896301, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1257.794677734375, "completions/mean_terminated_length": 891.0980224609375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07799264823397795, "frac_reward_zero_std": 0.0, "grad_norm": 0.13619905747048863, "kl": 0.00888824462890625, "learning_rate": 7.765957446808509e-07, "loss": 0.1136, "num_tokens": 257325002.0, "reward": 0.949776828289032, "reward_std": 0.3965776264667511, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4564732015132904, "rewards/tag_count_reward/std": 0.3200119733810425, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1344.372802734375, "completions/mean_terminated_length": 862.943603515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07820574290128389, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10920086514841651, "kl": 0.0057830810546875, "learning_rate": 7.787234042553192e-07, "loss": 0.1375, "num_tokens": 257997777.0, "reward": 0.8348214626312256, "reward_std": 0.46554961800575256, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4196428656578064, "rewards/tag_count_reward/std": 0.3310283124446869, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1367.7501220703125, "completions/mean_terminated_length": 884.8244018554688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.07841883756858985, "frac_reward_zero_std": 0.0, "grad_norm": 0.10960442022258464, "kl": 0.0053558349609375, "learning_rate": 7.808510638297872e-07, "loss": 0.1203, "num_tokens": 258676897.0, "reward": 0.9062500596046448, "reward_std": 0.45538076758384705, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4285714328289032, "rewards/tag_count_reward/std": 0.3301219344139099, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1238.4576416015625, "completions/mean_terminated_length": 870.4837646484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07863193223589579, "frac_reward_zero_std": 0.0, "grad_norm": 0.22078999102770358, "kl": 0.0058441162109375, "learning_rate": 7.829787234042553e-07, "loss": 0.1295, "num_tokens": 259307966.0, "reward": 0.9095982313156128, "reward_std": 0.4723241329193115, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4654017984867096, "rewards/tag_count_reward/std": 0.3334912359714508, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1299.2567138671875, "completions/mean_terminated_length": 875.1433715820312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07884502690320175, "frac_reward_zero_std": 0.0, "grad_norm": 0.11692593864040696, "kl": 0.0061187744140625, "learning_rate": 7.851063829787234e-07, "loss": 0.1285, "num_tokens": 259960385.0, "reward": 0.8816964626312256, "reward_std": 0.4885033965110779, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4419642984867096, "rewards/tag_count_reward/std": 0.32552167773246765, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1430.212158203125, "completions/mean_terminated_length": 909.0328979492188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07905812157050769, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11221244933418362, "kl": 0.00559234619140625, "learning_rate": 7.872340425531915e-07, "loss": 0.1137, "num_tokens": 260668080.0, "reward": 0.8727679252624512, "reward_std": 0.4812285900115967, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4263392984867096, "rewards/tag_count_reward/std": 0.33090004324913025, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1363.3951416015625, "completions/mean_terminated_length": 971.84912109375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.07927121623781365, "frac_reward_zero_std": 0.0, "grad_norm": 0.11257000777919583, "kl": 0.00624847412109375, "learning_rate": 7.893617021276595e-07, "loss": 0.1328, "num_tokens": 261349329.0, "reward": 0.9196429252624512, "reward_std": 0.5100753903388977, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4486607015132904, "rewards/tag_count_reward/std": 0.32579004764556885, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1178.2388916015625, "completions/mean_terminated_length": 795.096435546875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.07948431090511959, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15866669420654644, "kl": 0.0077972412109375, "learning_rate": 7.914893617021276e-07, "loss": 0.1657, "num_tokens": 261946972.0, "reward": 0.9241071939468384, "reward_std": 0.42977941036224365, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4709821343421936, "rewards/tag_count_reward/std": 0.3410661220550537, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1280.0535888671875, "completions/mean_terminated_length": 877.7958984375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.07969740557242555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11136107292107865, "kl": 0.00638580322265625, "learning_rate": 7.936170212765957e-07, "loss": 0.1159, "num_tokens": 262588852.0, "reward": 0.902901828289032, "reward_std": 0.47068747878074646, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4743303656578064, "rewards/tag_count_reward/std": 0.32367366552352905, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1107.930908203125, "completions/mean_terminated_length": 834.308349609375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0799105002397315, "frac_reward_zero_std": 0.0, "grad_norm": 0.12522299187898583, "kl": 0.0082550048828125, "learning_rate": 7.957446808510638e-07, "loss": 0.14, "num_tokens": 263157573.0, "reward": 1.0848214626312256, "reward_std": 0.4378872215747833, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5111607313156128, "rewards/tag_count_reward/std": 0.2956402003765106, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1278.575927734375, "completions/mean_terminated_length": 846.94775390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08012359490703745, "frac_reward_zero_std": 0.0, "grad_norm": 0.11331678560154371, "kl": 0.0064697265625, "learning_rate": 7.978723404255318e-07, "loss": 0.1201, "num_tokens": 263801735.0, "reward": 1.0212054252624512, "reward_std": 0.4924018383026123, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.34978869557380676, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1286.0513916015625, "completions/mean_terminated_length": 928.809814453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.08033668957434341, "frac_reward_zero_std": 0.0, "grad_norm": 0.11872886083967804, "kl": 0.00699615478515625, "learning_rate": 8e-07, "loss": 0.1073, "num_tokens": 264464190.0, "reward": 0.9804688096046448, "reward_std": 0.46269384026527405, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4737723171710968, "rewards/tag_count_reward/std": 0.3298339545726776, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1228.247802734375, "completions/mean_terminated_length": 777.2422485351562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08054978424164935, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11443456779144483, "kl": 0.0071868896484375, "learning_rate": 8.02127659574468e-07, "loss": 0.07, "num_tokens": 265082717.0, "reward": 1.01953125, "reward_std": 0.45115476846694946, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4771205484867096, "rewards/tag_count_reward/std": 0.34418463706970215, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1256.899658203125, "completions/mean_terminated_length": 846.5999755859375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.08076287890895531, "frac_reward_zero_std": 0.0, "grad_norm": 22.9217259742924, "kl": 0.72265625, "learning_rate": 8.042553191489362e-07, "loss": 0.1353, "num_tokens": 265718960.0, "reward": 0.9330357313156128, "reward_std": 0.44094958901405334, "rewards/accuracy_reward/mean": 0.44907405972480774, "rewards/accuracy_reward/std": 0.49797651171684265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.3083477020263672, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1213.4107666015625, "completions/mean_terminated_length": 872.2263793945312, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08097597357626125, "frac_reward_zero_std": 0.0, "grad_norm": 0.4703130841075945, "kl": 0.0083770751953125, "learning_rate": 8.063829787234043e-07, "loss": 0.1441, "num_tokens": 266324504.0, "reward": 1.0172991752624512, "reward_std": 0.49286893010139465, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5061383843421936, "rewards/tag_count_reward/std": 0.32912537455558777, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1299.75, "completions/mean_terminated_length": 815.5882568359375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.08118906824356721, "frac_reward_zero_std": 0.0, "grad_norm": 0.12186164788933354, "kl": 0.0063018798828125, "learning_rate": 8.085106382978723e-07, "loss": 0.144, "num_tokens": 266981352.0, "reward": 0.723214328289032, "reward_std": 0.4090487062931061, "rewards/accuracy_reward/mean": 0.3013392984867096, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.3311414122581482, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1177.821533203125, "completions/mean_terminated_length": 794.4951782226562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08140216291087315, "frac_reward_zero_std": 0.0, "grad_norm": 0.13122956227799018, "kl": 0.007293701171875, "learning_rate": 8.106382978723404e-07, "loss": 0.1642, "num_tokens": 267578520.0, "reward": 0.9447544813156128, "reward_std": 0.4290495216846466, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5005580186843872, "rewards/tag_count_reward/std": 0.3338218629360199, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1098.6920166015625, "completions/mean_terminated_length": 839.789794921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.08161525757817911, "frac_reward_zero_std": 0.0, "grad_norm": 0.12079459334840417, "kl": 0.00794219970703125, "learning_rate": 8.127659574468084e-07, "loss": 0.0857, "num_tokens": 268133614.0, "reward": 0.9553571939468384, "reward_std": 0.42930319905281067, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5089285969734192, "rewards/tag_count_reward/std": 0.31935787200927734, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1216.0023193359375, "completions/mean_terminated_length": 767.1237182617188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08182835224548506, "frac_reward_zero_std": 0.0, "grad_norm": 0.12150887080201064, "kl": 0.00646209716796875, "learning_rate": 8.148936170212766e-07, "loss": 0.1145, "num_tokens": 268741199.0, "reward": 0.8900669813156128, "reward_std": 0.431190550327301, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4681919515132904, "rewards/tag_count_reward/std": 0.32549819350242615, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1303.5804443359375, "completions/mean_terminated_length": 877.8245849609375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.08204144691279101, "frac_reward_zero_std": 0.0, "grad_norm": 0.12152986193302466, "kl": 0.00719451904296875, "learning_rate": 8.170212765957446e-07, "loss": 0.1204, "num_tokens": 269390627.0, "reward": 0.9084821939468384, "reward_std": 0.45585721731185913, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4598214328289032, "rewards/tag_count_reward/std": 0.3303336501121521, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1247.3929443359375, "completions/mean_terminated_length": 912.9620361328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.08225454158009696, "frac_reward_zero_std": 0.0, "grad_norm": 0.12133284329664117, "kl": 0.00803375244140625, "learning_rate": 8.191489361702127e-07, "loss": 0.1305, "num_tokens": 270012099.0, "reward": 1.0234375, "reward_std": 0.5021721720695496, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5167410969734192, "rewards/tag_count_reward/std": 0.31060466170310974, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1308.27685546875, "completions/mean_terminated_length": 935.932861328125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08246763624740291, "frac_reward_zero_std": 0.0, "grad_norm": 0.14095168509221923, "kl": 0.0076904296875, "learning_rate": 8.212765957446808e-07, "loss": 0.0976, "num_tokens": 270665391.0, "reward": 0.9006696939468384, "reward_std": 0.41965097188949585, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.34516119956970215, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 999.3616333007812, "completions/mean_terminated_length": 788.5093994140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08268073091470886, "frac_reward_zero_std": 0.0, "grad_norm": 0.1397852214011627, "kl": 0.0105133056640625, "learning_rate": 8.234042553191489e-07, "loss": 0.1621, "num_tokens": 271181553.0, "reward": 1.2020089626312256, "reward_std": 0.47391000390052795, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5904017686843872, "rewards/tag_count_reward/std": 0.3059394359588623, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1274.9085693359375, "completions/mean_terminated_length": 889.655517578125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.08289382558201482, "frac_reward_zero_std": 0.0, "grad_norm": 0.14044456729494353, "kl": 0.00778961181640625, "learning_rate": 8.255319148936169e-07, "loss": 0.1385, "num_tokens": 271826760.0, "reward": 0.8967634439468384, "reward_std": 0.47555842995643616, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.33575716614723206, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1109.7254638671875, "completions/mean_terminated_length": 800.6795043945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.08310692024932076, "frac_reward_zero_std": 0.0, "grad_norm": 0.13310501145274745, "kl": 0.00927734375, "learning_rate": 8.27659574468085e-07, "loss": 0.1175, "num_tokens": 272390493.0, "reward": 1.1428571939468384, "reward_std": 0.4624865651130676, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5558035969734192, "rewards/tag_count_reward/std": 0.3385384678840637, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1143.279052734375, "completions/mean_terminated_length": 800.876953125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.08332001491662672, "frac_reward_zero_std": 0.0, "grad_norm": 0.12331078161692945, "kl": 0.008148193359375, "learning_rate": 8.297872340425532e-07, "loss": 0.1217, "num_tokens": 272974794.0, "reward": 1.0106027126312256, "reward_std": 0.45167678594589233, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5329241156578064, "rewards/tag_count_reward/std": 0.33261173963546753, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1165.609375, "completions/mean_terminated_length": 756.1339721679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08353310958393266, "frac_reward_zero_std": 0.0, "grad_norm": 0.1276473587656498, "kl": 0.0080413818359375, "learning_rate": 8.319148936170213e-07, "loss": 0.1087, "num_tokens": 273562811.0, "reward": 0.9268973469734192, "reward_std": 0.4446876645088196, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5139508843421936, "rewards/tag_count_reward/std": 0.33810997009277344, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1237.982177734375, "completions/mean_terminated_length": 809.474365234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08374620425123862, "frac_reward_zero_std": 0.0, "grad_norm": 0.3691404829409415, "kl": 0.008453369140625, "learning_rate": 8.340425531914893e-07, "loss": 0.1421, "num_tokens": 274184147.0, "reward": 0.9921875596046448, "reward_std": 0.41376620531082153, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5301339030265808, "rewards/tag_count_reward/std": 0.32886216044425964, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1371.4285888671875, "completions/mean_terminated_length": 904.2113037109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08395929891854456, "frac_reward_zero_std": 0.0, "grad_norm": 0.10926351966410949, "kl": 0.0069122314453125, "learning_rate": 8.361702127659575e-07, "loss": 0.1233, "num_tokens": 274869747.0, "reward": 0.9408482313156128, "reward_std": 0.5249491930007935, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4698660671710968, "rewards/tag_count_reward/std": 0.34787043929100037, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1208.2054443359375, "completions/mean_terminated_length": 793.9066772460938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.08417239358585052, "frac_reward_zero_std": 0.0, "grad_norm": 0.1314812214291103, "kl": 0.00891876220703125, "learning_rate": 8.382978723404255e-07, "loss": 0.1259, "num_tokens": 275483007.0, "reward": 0.9838169813156128, "reward_std": 0.45798468589782715, "rewards/accuracy_reward/mean": 0.4513888955116272, "rewards/accuracy_reward/std": 0.49820831418037415, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5485491156578064, "rewards/tag_count_reward/std": 0.3431383967399597, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1063.75, "completions/mean_terminated_length": 754.9091186523438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08438548825315646, "frac_reward_zero_std": 0.0, "grad_norm": 0.14201382376296273, "kl": 0.009429931640625, "learning_rate": 8.404255319148936e-07, "loss": 0.0703, "num_tokens": 276025615.0, "reward": 1.0044643878936768, "reward_std": 0.45521751046180725, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5446428656578064, "rewards/tag_count_reward/std": 0.3454955816268921, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1212.52685546875, "completions/mean_terminated_length": 783.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08459858292046242, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275173509121519, "kl": 0.0084228515625, "learning_rate": 8.425531914893617e-07, "loss": 0.1178, "num_tokens": 276643339.0, "reward": 0.8956473469734192, "reward_std": 0.4494919180870056, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5139508843421936, "rewards/tag_count_reward/std": 0.3558407127857208, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1137.7879638671875, "completions/mean_terminated_length": 753.4761962890625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.08481167758776836, "frac_reward_zero_std": 0.0, "grad_norm": 0.11779748853008372, "kl": 0.009307861328125, "learning_rate": 8.446808510638298e-07, "loss": 0.157, "num_tokens": 277214300.0, "reward": 0.9871652126312256, "reward_std": 0.43661531805992126, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55859375, "rewards/tag_count_reward/std": 0.3403328061103821, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 943.2567138671875, "completions/mean_terminated_length": 742.1293334960938, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.08502477225507432, "frac_reward_zero_std": 0.0, "grad_norm": 0.1400392652122895, "kl": 0.011993408203125, "learning_rate": 8.468085106382978e-07, "loss": 0.1528, "num_tokens": 277706623.0, "reward": 1.1149554252624512, "reward_std": 0.4290737509727478, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6350446343421936, "rewards/tag_count_reward/std": 0.2985040843486786, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1140.138427734375, "completions/mean_terminated_length": 735.9935302734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08523786692238026, "frac_reward_zero_std": 0.0, "grad_norm": 0.13630200511551344, "kl": 0.00958251953125, "learning_rate": 8.489361702127658e-07, "loss": 0.1599, "num_tokens": 278286909.0, "reward": 1.0418527126312256, "reward_std": 0.444623738527298, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5530133843421936, "rewards/tag_count_reward/std": 0.3333728015422821, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1225.587158203125, "completions/mean_terminated_length": 843.9444580078125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.08545096158968622, "frac_reward_zero_std": 0.0, "grad_norm": 0.12969004505965936, "kl": 0.0097198486328125, "learning_rate": 8.51063829787234e-07, "loss": 0.2023, "num_tokens": 278903140.0, "reward": 0.914620578289032, "reward_std": 0.5368692874908447, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5530133843421936, "rewards/tag_count_reward/std": 0.3329531252384186, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1154.821533203125, "completions/mean_terminated_length": 793.6300659179688, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.08566405625699217, "frac_reward_zero_std": 0.0, "grad_norm": 0.14148337863327637, "kl": 0.010711669921875, "learning_rate": 8.53191489361702e-07, "loss": 0.2116, "num_tokens": 279488196.0, "reward": 1.015625, "reward_std": 0.5095352530479431, "rewards/accuracy_reward/mean": 0.44675925374031067, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5848214030265808, "rewards/tag_count_reward/std": 0.3173815906047821, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1362.4910888671875, "completions/mean_terminated_length": 862.2548217773438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.08587715092429812, "frac_reward_zero_std": 0.0, "grad_norm": 0.1869206375790302, "kl": 0.0090484619140625, "learning_rate": 8.553191489361702e-07, "loss": 0.1735, "num_tokens": 280171312.0, "reward": 0.9347098469734192, "reward_std": 0.4820769131183624, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5329241156578064, "rewards/tag_count_reward/std": 0.349021852016449, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1171.71875, "completions/mean_terminated_length": 797.7643432617188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08609024559160407, "frac_reward_zero_std": 0.0, "grad_norm": 0.8501290762371464, "kl": 0.0127410888671875, "learning_rate": 8.574468085106383e-07, "loss": 0.1794, "num_tokens": 280765282.0, "reward": 1.0541294813156128, "reward_std": 0.48049402236938477, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5786830186843872, "rewards/tag_count_reward/std": 0.3440576493740082, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1261.055908203125, "completions/mean_terminated_length": 895.87255859375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.08630334025891002, "frac_reward_zero_std": 0.0, "grad_norm": 0.11630979067192901, "kl": 0.0093841552734375, "learning_rate": 8.595744680851064e-07, "loss": 0.1694, "num_tokens": 281403435.0, "reward": 0.9893973469734192, "reward_std": 0.45581328868865967, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5719866156578064, "rewards/tag_count_reward/std": 0.3414534330368042, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1240.622802734375, "completions/mean_terminated_length": 677.9053344726562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08651643492621597, "frac_reward_zero_std": 0.0, "grad_norm": 0.19638996721638072, "kl": 0.0095977783203125, "learning_rate": 8.617021276595744e-07, "loss": 0.1741, "num_tokens": 282027266.0, "reward": 0.8750000596046448, "reward_std": 0.48078683018684387, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5267857313156128, "rewards/tag_count_reward/std": 0.3469379246234894, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1002.6875610351562, "completions/mean_terminated_length": 713.8119506835938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08672952959352193, "frac_reward_zero_std": 0.0, "grad_norm": 0.13872811441735106, "kl": 0.0122833251953125, "learning_rate": 8.638297872340426e-07, "loss": 0.1853, "num_tokens": 282543286.0, "reward": 1.1763393878936768, "reward_std": 0.4613015651702881, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.6383928656578064, "rewards/tag_count_reward/std": 0.31612035632133484, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1293.90185546875, "completions/mean_terminated_length": 898.89794921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.08694262426082787, "frac_reward_zero_std": 0.0, "grad_norm": 0.11885463751958361, "kl": 0.0086822509765625, "learning_rate": 8.659574468085106e-07, "loss": 0.139, "num_tokens": 283200762.0, "reward": 0.8621652126312256, "reward_std": 0.4854860305786133, "rewards/accuracy_reward/mean": 0.3147321343421936, "rewards/accuracy_reward/std": 0.4649282693862915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5474330186843872, "rewards/tag_count_reward/std": 0.3325366675853729, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1270.34375, "completions/mean_terminated_length": 821.274658203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.08715571892813383, "frac_reward_zero_std": 0.0, "grad_norm": 0.11533406625901478, "kl": 0.0095672607421875, "learning_rate": 8.680851063829787e-07, "loss": 0.1466, "num_tokens": 283840100.0, "reward": 1.0100446939468384, "reward_std": 0.48490050435066223, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5591517686843872, "rewards/tag_count_reward/std": 0.3433043658733368, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1072.841552734375, "completions/mean_terminated_length": 747.7886962890625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08736881359543977, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1545740167944373, "kl": 0.01397705078125, "learning_rate": 8.702127659574467e-07, "loss": 0.075, "num_tokens": 284386573.0, "reward": 1.1568081378936768, "reward_std": 0.3985455632209778, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6077008843421936, "rewards/tag_count_reward/std": 0.32978853583335876, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1160.9263916015625, "completions/mean_terminated_length": 790.3765869140625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.08758190826274573, "frac_reward_zero_std": 0.0, "grad_norm": 0.12193730660863826, "kl": 0.0103912353515625, "learning_rate": 8.723404255319149e-07, "loss": 0.1217, "num_tokens": 284977004.0, "reward": 0.9720982313156128, "reward_std": 0.5002922415733337, "rewards/accuracy_reward/mean": 0.43981480598449707, "rewards/accuracy_reward/std": 0.496940016746521, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5479910969734192, "rewards/tag_count_reward/std": 0.34260547161102295, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1172.357177734375, "completions/mean_terminated_length": 744.7175903320312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08779500293005167, "frac_reward_zero_std": 0.0, "grad_norm": 0.12197006773415212, "kl": 0.009246826171875, "learning_rate": 8.744680851063829e-07, "loss": 0.1893, "num_tokens": 285570876.0, "reward": 0.906808078289032, "reward_std": 0.45161154866218567, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5474330186843872, "rewards/tag_count_reward/std": 0.32830509543418884, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1158.34375, "completions/mean_terminated_length": 790.6940307617188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.08800809759735763, "frac_reward_zero_std": 0.0, "grad_norm": 0.12373033503104719, "kl": 0.009521484375, "learning_rate": 8.76595744680851e-07, "loss": 0.132, "num_tokens": 286169446.0, "reward": 0.9871652126312256, "reward_std": 0.4302607476711273, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5943080186843872, "rewards/tag_count_reward/std": 0.3309297561645508, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1150.841552734375, "completions/mean_terminated_length": 759.7724609375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.08822119226466357, "frac_reward_zero_std": 0.0, "grad_norm": 0.12886624788073314, "kl": 0.009368896484375, "learning_rate": 8.787234042553191e-07, "loss": 0.2051, "num_tokens": 286756927.0, "reward": 1.01171875, "reward_std": 0.45868971943855286, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.3219386637210846, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1145.404052734375, "completions/mean_terminated_length": 826.3594970703125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.08843428693196953, "frac_reward_zero_std": 0.0, "grad_norm": 0.47207680286426645, "kl": 0.0138397216796875, "learning_rate": 8.808510638297872e-07, "loss": 0.1608, "num_tokens": 287335924.0, "reward": 1.1696429252624512, "reward_std": 0.46191391348838806, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6116071343421936, "rewards/tag_count_reward/std": 0.32527613639831543, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1378.9398193359375, "completions/mean_terminated_length": 958.0399780273438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08864738159927547, "frac_reward_zero_std": 0.0, "grad_norm": 0.10072030654047953, "kl": 0.0084075927734375, "learning_rate": 8.829787234042553e-07, "loss": 0.095, "num_tokens": 288023257.0, "reward": 0.9553571939468384, "reward_std": 0.457049161195755, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5602678656578064, "rewards/tag_count_reward/std": 0.33023539185523987, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1267.07373046875, "completions/mean_terminated_length": 862.0508422851562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08886047626658143, "frac_reward_zero_std": 0.0, "grad_norm": 0.14592557482538301, "kl": 0.01007080078125, "learning_rate": 8.851063829787234e-07, "loss": 0.1525, "num_tokens": 288667178.0, "reward": 0.9698660969734192, "reward_std": 0.48869234323501587, "rewards/accuracy_reward/mean": 0.43518519401550293, "rewards/accuracy_reward/std": 0.4963560700416565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5502232313156128, "rewards/tag_count_reward/std": 0.34269291162490845, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1056.7567138671875, "completions/mean_terminated_length": 807.5614013671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08907357093388737, "frac_reward_zero_std": 0.0, "grad_norm": 0.1344909011669105, "kl": 0.0123443603515625, "learning_rate": 8.872340425531915e-07, "loss": 0.1524, "num_tokens": 289206541.0, "reward": 1.1969866752624512, "reward_std": 0.4307486116886139, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6590401530265808, "rewards/tag_count_reward/std": 0.30918413400650024, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1324.7076416015625, "completions/mean_terminated_length": 907.0316772460938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.08928666560119333, "frac_reward_zero_std": 0.0, "grad_norm": 0.15712268060789628, "kl": 0.0111846923828125, "learning_rate": 8.893617021276595e-07, "loss": 0.1069, "num_tokens": 289867674.0, "reward": 0.9095982313156128, "reward_std": 0.5182147026062012, "rewards/accuracy_reward/mean": 0.3504464328289032, "rewards/accuracy_reward/std": 0.47764310240745544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5591517686843872, "rewards/tag_count_reward/std": 0.35135555267333984, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1142.4710693359375, "completions/mean_terminated_length": 743.5723266601562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.08949976026849928, "frac_reward_zero_std": 0.0, "grad_norm": 0.12941238973364502, "kl": 0.011871337890625, "learning_rate": 8.914893617021276e-07, "loss": 0.1273, "num_tokens": 290454685.0, "reward": 1.020647406578064, "reward_std": 0.44607022404670715, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6032366156578064, "rewards/tag_count_reward/std": 0.3295234441757202, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1242.9285888671875, "completions/mean_terminated_length": 938.239990234375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08971285493580523, "frac_reward_zero_std": 0.0, "grad_norm": 0.12109964315197068, "kl": 0.01214599609375, "learning_rate": 8.936170212765957e-07, "loss": 0.1435, "num_tokens": 291081229.0, "reward": 1.1824777126312256, "reward_std": 0.49143338203430176, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6311383843421936, "rewards/tag_count_reward/std": 0.3454987406730652, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1093.810302734375, "completions/mean_terminated_length": 752.6151123046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.08992594960311118, "frac_reward_zero_std": 0.0, "grad_norm": 0.4082328573676096, "kl": 0.0132904052734375, "learning_rate": 8.957446808510638e-07, "loss": 0.1402, "num_tokens": 291639992.0, "reward": 1.223772406578064, "reward_std": 0.4036027491092682, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.32790935039520264, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1170.25, "completions/mean_terminated_length": 807.5205078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.09013904427041713, "frac_reward_zero_std": 0.0, "grad_norm": 0.13500643030383605, "kl": 0.0111236572265625, "learning_rate": 8.978723404255318e-07, "loss": 0.179, "num_tokens": 292243736.0, "reward": 1.0535714626312256, "reward_std": 0.46681398153305054, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5825892686843872, "rewards/tag_count_reward/std": 0.33509889245033264, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1269.2545166015625, "completions/mean_terminated_length": 900.375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.09035213893772308, "frac_reward_zero_std": 0.0, "grad_norm": 0.12209733702831041, "kl": 0.010467529296875, "learning_rate": 9e-07, "loss": 0.1315, "num_tokens": 292878394.0, "reward": 1.0150669813156128, "reward_std": 0.45642924308776855, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5641741156578064, "rewards/tag_count_reward/std": 0.3442317843437195, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1259.69873046875, "completions/mean_terminated_length": 838.5513916015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09056523360502904, "frac_reward_zero_std": 0.0, "grad_norm": 0.1176347988210879, "kl": 0.01043701171875, "learning_rate": 9.02127659574468e-07, "loss": 0.1184, "num_tokens": 293511731.0, "reward": 0.9235491752624512, "reward_std": 0.42911988496780396, "rewards/accuracy_reward/mean": 0.34490740299224854, "rewards/accuracy_reward/std": 0.4758892059326172, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5909598469734192, "rewards/tag_count_reward/std": 0.33312928676605225, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1104.65185546875, "completions/mean_terminated_length": 840.5142822265625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09077832827233498, "frac_reward_zero_std": 0.0, "grad_norm": 0.4159101694611062, "kl": 0.0137939453125, "learning_rate": 9.042553191489361e-07, "loss": 0.1367, "num_tokens": 294079319.0, "reward": 1.2299107313156128, "reward_std": 0.44114354252815247, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6428571343421936, "rewards/tag_count_reward/std": 0.3259202539920807, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1154.375, "completions/mean_terminated_length": 796.9249877929688, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.09099142293964094, "frac_reward_zero_std": 0.0, "grad_norm": 0.5289366488765045, "kl": 0.014892578125, "learning_rate": 9.063829787234041e-07, "loss": 0.0995, "num_tokens": 294667391.0, "reward": 1.0920759439468384, "reward_std": 0.42149776220321655, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6032366156578064, "rewards/tag_count_reward/std": 0.32136034965515137, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1220.962158203125, "completions/mean_terminated_length": 783.4505004882812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.09120451760694688, "frac_reward_zero_std": 0.0, "grad_norm": 0.12314840787364822, "kl": 0.0100250244140625, "learning_rate": 9.085106382978724e-07, "loss": 0.1711, "num_tokens": 295292734.0, "reward": 1.0703125, "reward_std": 0.4780718684196472, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5881696343421936, "rewards/tag_count_reward/std": 0.3520229160785675, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1210.72998046875, "completions/mean_terminated_length": 805.9569702148438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.09141761227425284, "frac_reward_zero_std": 0.0, "grad_norm": 0.11447318591624865, "kl": 0.0113983154296875, "learning_rate": 9.106382978723404e-07, "loss": 0.1264, "num_tokens": 295900133.0, "reward": 1.1227679252624512, "reward_std": 0.4260932505130768, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6116071343421936, "rewards/tag_count_reward/std": 0.3370972275733948, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1145.3638916015625, "completions/mean_terminated_length": 840.8925170898438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.09163070694155878, "frac_reward_zero_std": 0.0, "grad_norm": 0.12672488335703755, "kl": 0.0112457275390625, "learning_rate": 9.127659574468085e-07, "loss": 0.118, "num_tokens": 296480952.0, "reward": 1.1060268878936768, "reward_std": 0.4731019139289856, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6149553656578064, "rewards/tag_count_reward/std": 0.3241053819656372, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1232.7545166015625, "completions/mean_terminated_length": 854.4379272460938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.09184380160886474, "frac_reward_zero_std": 0.0, "grad_norm": 0.9810241276347809, "kl": 0.0146026611328125, "learning_rate": 9.148936170212766e-07, "loss": 0.1508, "num_tokens": 297098074.0, "reward": 1.1010044813156128, "reward_std": 0.4596308171749115, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6277901530265808, "rewards/tag_count_reward/std": 0.3361065089702606, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1212.122802734375, "completions/mean_terminated_length": 791.3792114257812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09205689627617068, "frac_reward_zero_std": 0.0, "grad_norm": 0.1291479991254237, "kl": 0.0109710693359375, "learning_rate": 9.170212765957447e-07, "loss": 0.1452, "num_tokens": 297712801.0, "reward": 1.0580357313156128, "reward_std": 0.44149741530418396, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5758928656578064, "rewards/tag_count_reward/std": 0.35051780939102173, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1124.62060546875, "completions/mean_terminated_length": 650.4526977539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.09226999094347664, "frac_reward_zero_std": 0.0, "grad_norm": 0.14046931841678065, "kl": 0.0123443603515625, "learning_rate": 9.191489361702127e-07, "loss": 0.1474, "num_tokens": 298284343.0, "reward": 1.01953125, "reward_std": 0.4352644979953766, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6177455186843872, "rewards/tag_count_reward/std": 0.33016306161880493, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1054.453125, "completions/mean_terminated_length": 801.1961059570312, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.09248308561078258, "frac_reward_zero_std": 0.0, "grad_norm": 0.13874348920062532, "kl": 0.0171661376953125, "learning_rate": 9.212765957446809e-07, "loss": 0.1568, "num_tokens": 298826514.0, "reward": 1.2566964626312256, "reward_std": 0.3902831971645355, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.3065285086631775, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1261.3660888671875, "completions/mean_terminated_length": 836.9622192382812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09269618027808854, "frac_reward_zero_std": 0.0, "grad_norm": 0.1164956345337005, "kl": 0.0110015869140625, "learning_rate": 9.234042553191489e-07, "loss": 0.1307, "num_tokens": 299457750.0, "reward": 0.9972098469734192, "reward_std": 0.46054917573928833, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.348928838968277, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1256.77685546875, "completions/mean_terminated_length": 842.3265380859375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.09290927494539448, "frac_reward_zero_std": 0.0, "grad_norm": 4.228366832954742, "kl": 0.0327606201171875, "learning_rate": 9.255319148936169e-07, "loss": 0.1244, "num_tokens": 300097730.0, "reward": 1.0602679252624512, "reward_std": 0.5038176774978638, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6026785969734192, "rewards/tag_count_reward/std": 0.35085955262184143, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1191.743408203125, "completions/mean_terminated_length": 692.5123901367188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09312236961270044, "frac_reward_zero_std": 0.0, "grad_norm": 0.11963888583713599, "kl": 0.009979248046875, "learning_rate": 9.27659574468085e-07, "loss": 0.1469, "num_tokens": 300704015.0, "reward": 0.9648438096046448, "reward_std": 0.3931638300418854, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5965401530265808, "rewards/tag_count_reward/std": 0.33155161142349243, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1266.8013916015625, "completions/mean_terminated_length": 815.6865844726562, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.09333546428000639, "frac_reward_zero_std": 0.0, "grad_norm": 0.11927082869488527, "kl": 0.012115478515625, "learning_rate": 9.297872340425531e-07, "loss": 0.1345, "num_tokens": 301341686.0, "reward": 1.0904018878936768, "reward_std": 0.4178585410118103, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6082589030265808, "rewards/tag_count_reward/std": 0.3570931553840637, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1063.294677734375, "completions/mean_terminated_length": 742.8284301757812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09354855894731234, "frac_reward_zero_std": 0.0, "grad_norm": 0.14369039189530988, "kl": 0.01312255859375, "learning_rate": 9.319148936170212e-07, "loss": 0.1522, "num_tokens": 301888826.0, "reward": 1.1127232313156128, "reward_std": 0.38257989287376404, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6551339030265808, "rewards/tag_count_reward/std": 0.3154067397117615, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1170.6585693359375, "completions/mean_terminated_length": 811.996826171875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0937616536146183, "frac_reward_zero_std": 0.0, "grad_norm": 0.30689896878613543, "kl": 0.0345458984375, "learning_rate": 9.340425531914892e-07, "loss": 0.1494, "num_tokens": 302486081.0, "reward": 1.1177456378936768, "reward_std": 0.440822958946228, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6735491156578064, "rewards/tag_count_reward/std": 0.3244994580745697, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1320.1295166015625, "completions/mean_terminated_length": 862.2327270507812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09397474828192424, "frac_reward_zero_std": 0.0, "grad_norm": 0.12317266904399936, "kl": 0.01263427734375, "learning_rate": 9.361702127659575e-07, "loss": 0.1497, "num_tokens": 303149531.0, "reward": 0.9748884439468384, "reward_std": 0.5206019282341003, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5909598469734192, "rewards/tag_count_reward/std": 0.3462999761104584, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1245.7545166015625, "completions/mean_terminated_length": 910.6392822265625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0941878429492302, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287317155890944, "kl": 0.01409912109375, "learning_rate": 9.382978723404255e-07, "loss": 0.0887, "num_tokens": 303771645.0, "reward": 1.140625, "reward_std": 0.49231529235839844, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6741071343421936, "rewards/tag_count_reward/std": 0.3205128610134125, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1065.2210693359375, "completions/mean_terminated_length": 790.0428466796875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.09440093761653615, "frac_reward_zero_std": 0.0, "grad_norm": 0.12968192211795326, "kl": 0.013031005859375, "learning_rate": 9.404255319148936e-07, "loss": 0.0749, "num_tokens": 304320176.0, "reward": 1.1891741752624512, "reward_std": 0.41448912024497986, "rewards/accuracy_reward/mean": 0.5069444179534912, "rewards/accuracy_reward/std": 0.5005314350128174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7003348469734192, "rewards/tag_count_reward/std": 0.31977149844169617, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1014.4397583007812, "completions/mean_terminated_length": 765.3545532226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0946140322838421, "frac_reward_zero_std": 0.0, "grad_norm": 0.14759827614171392, "kl": 0.0149688720703125, "learning_rate": 9.425531914893617e-07, "loss": 0.1356, "num_tokens": 304850117.0, "reward": 1.2293527126312256, "reward_std": 0.38068321347236633, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6981026530265808, "rewards/tag_count_reward/std": 0.31941601634025574, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1122.930908203125, "completions/mean_terminated_length": 807.1886596679688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09482712695114805, "frac_reward_zero_std": 0.0, "grad_norm": 0.7748710603056193, "kl": 0.01385498046875, "learning_rate": 9.446808510638298e-07, "loss": 0.1483, "num_tokens": 305424550.0, "reward": 1.1696429252624512, "reward_std": 0.48740342259407043, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6763392686843872, "rewards/tag_count_reward/std": 0.32146963477134705, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1131.49560546875, "completions/mean_terminated_length": 756.8238525390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.095040221618454, "frac_reward_zero_std": 0.0, "grad_norm": 0.13267983887980828, "kl": 0.011871337890625, "learning_rate": 9.468085106382978e-07, "loss": 0.1591, "num_tokens": 306001844.0, "reward": 1.0390625, "reward_std": 0.423543781042099, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6462053656578064, "rewards/tag_count_reward/std": 0.34123262763023376, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1163.99560546875, "completions/mean_terminated_length": 883.1941528320312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.09525331628575995, "frac_reward_zero_std": 0.0, "grad_norm": 0.12894255539075125, "kl": 0.0131988525390625, "learning_rate": 9.489361702127659e-07, "loss": 0.1404, "num_tokens": 306595266.0, "reward": 1.2405134439468384, "reward_std": 0.46087321639060974, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6802455186843872, "rewards/tag_count_reward/std": 0.3229917287826538, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1079.9442138671875, "completions/mean_terminated_length": 701.1397705078125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0954664109530659, "frac_reward_zero_std": 0.0, "grad_norm": 0.13842151229941713, "kl": 0.0139923095703125, "learning_rate": 9.51063829787234e-07, "loss": 0.1134, "num_tokens": 307150633.0, "reward": 1.1908482313156128, "reward_std": 0.38598817586898804, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6662946343421936, "rewards/tag_count_reward/std": 0.3194340765476227, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1297.8192138671875, "completions/mean_terminated_length": 885.0899658203125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.09567950562037185, "frac_reward_zero_std": 0.0, "grad_norm": 0.12584569530561895, "kl": 0.0123138427734375, "learning_rate": 9.531914893617021e-07, "loss": 0.1527, "num_tokens": 307801240.0, "reward": 1.0100446939468384, "reward_std": 0.4523391127586365, "rewards/accuracy_reward/mean": 0.39120370149612427, "rewards/accuracy_reward/std": 0.4885856807231903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.34424853324890137, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1079.4732666015625, "completions/mean_terminated_length": 793.9537353515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0958926002876778, "frac_reward_zero_std": 0.0, "grad_norm": 0.14007582337384927, "kl": 0.0145263671875, "learning_rate": 9.553191489361702e-07, "loss": 0.1058, "num_tokens": 308358556.0, "reward": 1.1997768878936768, "reward_std": 0.42876285314559937, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6752232313156128, "rewards/tag_count_reward/std": 0.30605366826057434, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1064.977783203125, "completions/mean_terminated_length": 803.9491577148438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09610569495498375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1415933337491164, "kl": 0.01531982421875, "learning_rate": 9.574468085106384e-07, "loss": 0.1308, "num_tokens": 308907538.0, "reward": 1.30859375, "reward_std": 0.4190255403518677, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035415053367615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7081473469734192, "rewards/tag_count_reward/std": 0.31428611278533936, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1154.0201416015625, "completions/mean_terminated_length": 852.4686889648438, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.09631878962228971, "frac_reward_zero_std": 0.0, "grad_norm": 0.11258313246598783, "kl": 0.013702392578125, "learning_rate": 9.595744680851063e-07, "loss": 0.1297, "num_tokens": 309486011.0, "reward": 1.1305804252624512, "reward_std": 0.47361451387405396, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.32834547758102417, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1150.5648193359375, "completions/mean_terminated_length": 829.6636352539062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09653188428959565, "frac_reward_zero_std": 0.0, "grad_norm": 0.3226003479524171, "kl": 0.0183868408203125, "learning_rate": 9.617021276595744e-07, "loss": 0.1645, "num_tokens": 310072664.0, "reward": 1.0920759439468384, "reward_std": 0.42832887172698975, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6612723469734192, "rewards/tag_count_reward/std": 0.33540377020835876, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1138.0357666015625, "completions/mean_terminated_length": 801.3211059570312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.09674497895690161, "frac_reward_zero_std": 0.0, "grad_norm": 0.1297357108723474, "kl": 0.012603759765625, "learning_rate": 9.638297872340426e-07, "loss": 0.1127, "num_tokens": 310656568.0, "reward": 1.0809152126312256, "reward_std": 0.45041897892951965, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6188616156578064, "rewards/tag_count_reward/std": 0.3434693217277527, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1274.9129638671875, "completions/mean_terminated_length": 901.1688842773438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.09695807362420755, "frac_reward_zero_std": 0.0, "grad_norm": 0.11556313160924528, "kl": 0.010711669921875, "learning_rate": 9.659574468085105e-07, "loss": 0.176, "num_tokens": 311300625.0, "reward": 0.9012277126312256, "reward_std": 0.435077428817749, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45639166235923767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6065848469734192, "rewards/tag_count_reward/std": 0.3347778618335724, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1185.453125, "completions/mean_terminated_length": 859.0123291015625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09717116829151351, "frac_reward_zero_std": 0.0, "grad_norm": 0.7537882718330753, "kl": 0.0487060546875, "learning_rate": 9.680851063829786e-07, "loss": 0.1028, "num_tokens": 311899996.0, "reward": 1.090959906578064, "reward_std": 0.41947564482688904, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6646205186843872, "rewards/tag_count_reward/std": 0.3200836777687073, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1137.2879638671875, "completions/mean_terminated_length": 776.9750366210938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.09738426295881945, "frac_reward_zero_std": 0.0, "grad_norm": 0.1380195687610888, "kl": 0.0118865966796875, "learning_rate": 9.702127659574467e-07, "loss": 0.0913, "num_tokens": 312474557.0, "reward": 1.079241156578064, "reward_std": 0.4124201536178589, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6439732313156128, "rewards/tag_count_reward/std": 0.31935593485832214, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1189.544677734375, "completions/mean_terminated_length": 807.3935546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09759735762612541, "frac_reward_zero_std": 0.0, "grad_norm": 0.12936722184663546, "kl": 0.012237548828125, "learning_rate": 9.723404255319149e-07, "loss": 0.1174, "num_tokens": 313077569.0, "reward": 1.14453125, "reward_std": 0.4015459716320038, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6311383843421936, "rewards/tag_count_reward/std": 0.3339751660823822, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1251.747802734375, "completions/mean_terminated_length": 874.57568359375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09781045229343135, "frac_reward_zero_std": 0.0, "grad_norm": 0.12649023922652772, "kl": 0.011260986328125, "learning_rate": 9.74468085106383e-07, "loss": 0.1283, "num_tokens": 313714016.0, "reward": 1.046875, "reward_std": 0.4707343876361847, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.328119158744812, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1210.9442138671875, "completions/mean_terminated_length": 883.400634765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09802354696073731, "frac_reward_zero_std": 0.0, "grad_norm": 0.12754187424830113, "kl": 0.0118865966796875, "learning_rate": 9.765957446808511e-07, "loss": 0.1364, "num_tokens": 314327303.0, "reward": 1.0770089626312256, "reward_std": 0.48893120884895325, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.494759202003479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6529017686843872, "rewards/tag_count_reward/std": 0.3133895695209503, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1040.7366943359375, "completions/mean_terminated_length": 783.9832153320312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.09823664162804326, "frac_reward_zero_std": 0.0, "grad_norm": 0.14569327225511766, "kl": 0.0135345458984375, "learning_rate": 9.78723404255319e-07, "loss": 0.1423, "num_tokens": 314860753.0, "reward": 1.1852679252624512, "reward_std": 0.4032217562198639, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6808035969734192, "rewards/tag_count_reward/std": 0.322462260723114, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1013.88623046875, "completions/mean_terminated_length": 750.2885131835938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.09844973629534921, "frac_reward_zero_std": 0.0, "grad_norm": 0.15114678144959096, "kl": 0.0137481689453125, "learning_rate": 9.808510638297872e-07, "loss": 0.191, "num_tokens": 315383966.0, "reward": 1.1852679252624512, "reward_std": 0.45902127027511597, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6852678656578064, "rewards/tag_count_reward/std": 0.3373563885688782, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1179.6116943359375, "completions/mean_terminated_length": 869.096923828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.09866283096265516, "frac_reward_zero_std": 0.0, "grad_norm": 0.1315273401210009, "kl": 0.012359619140625, "learning_rate": 9.829787234042553e-07, "loss": 0.1383, "num_tokens": 315984944.0, "reward": 1.078125, "reward_std": 0.4403482973575592, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6316964030265808, "rewards/tag_count_reward/std": 0.31456056237220764, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1047.1473388671875, "completions/mean_terminated_length": 792.0280151367188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.09887592562996111, "frac_reward_zero_std": 0.0, "grad_norm": 0.3075517367740281, "kl": 0.0149078369140625, "learning_rate": 9.851063829787235e-07, "loss": 0.1517, "num_tokens": 316521746.0, "reward": 1.1205357313156128, "reward_std": 0.4365750253200531, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6897321343421936, "rewards/tag_count_reward/std": 0.31328800320625305, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1112.5804443359375, "completions/mean_terminated_length": 800.7738037109375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09908902029726706, "frac_reward_zero_std": 0.0, "grad_norm": 0.12263570237272706, "kl": 0.01287841796875, "learning_rate": 9.872340425531914e-07, "loss": 0.0933, "num_tokens": 317084486.0, "reward": 1.1238839626312256, "reward_std": 0.40685784816741943, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.3129750192165375, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1125.5826416015625, "completions/mean_terminated_length": 776.4830932617188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09930211496457302, "frac_reward_zero_std": 0.0, "grad_norm": 0.26723573147817115, "kl": 0.0135955810546875, "learning_rate": 9.893617021276595e-07, "loss": 0.1407, "num_tokens": 317654155.0, "reward": 1.203125, "reward_std": 0.447807639837265, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6897321343421936, "rewards/tag_count_reward/std": 0.32034924626350403, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1260.8125, "completions/mean_terminated_length": 924.8789672851562, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.09951520963187896, "frac_reward_zero_std": 0.0, "grad_norm": 0.11900629784858324, "kl": 0.0116119384765625, "learning_rate": 9.914893617021276e-07, "loss": 0.1267, "num_tokens": 318304967.0, "reward": 0.9441964626312256, "reward_std": 0.39277732372283936, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839529275894165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6205357313156128, "rewards/tag_count_reward/std": 0.31769606471061707, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1018.685302734375, "completions/mean_terminated_length": 726.7020263671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.09972830429918492, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16530537406241555, "kl": 0.015228271484375, "learning_rate": 9.936170212765958e-07, "loss": 0.1576, "num_tokens": 318829402.0, "reward": 1.1969866752624512, "reward_std": 0.38769274950027466, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7036830186843872, "rewards/tag_count_reward/std": 0.3141033351421356, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1207.7054443359375, "completions/mean_terminated_length": 845.2779541015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.09994139896649086, "frac_reward_zero_std": 0.0, "grad_norm": 0.13483532352460165, "kl": 0.01446533203125, "learning_rate": 9.957446808510637e-07, "loss": 0.1468, "num_tokens": 319440150.0, "reward": 1.0524554252624512, "reward_std": 0.4234815537929535, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6216517686843872, "rewards/tag_count_reward/std": 0.3365171253681183, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1105.555908203125, "completions/mean_terminated_length": 736.7733154296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.10015449363379682, "frac_reward_zero_std": 0.0, "grad_norm": 0.1638360128156321, "kl": 0.013336181640625, "learning_rate": 9.978723404255318e-07, "loss": 0.1397, "num_tokens": 320007855.0, "reward": 1.1138393878936768, "reward_std": 0.39371177554130554, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6584821343421936, "rewards/tag_count_reward/std": 0.3418850302696228, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1159.078125, "completions/mean_terminated_length": 869.7840576171875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.10036758830110276, "frac_reward_zero_std": 0.0, "grad_norm": 0.12421842783563647, "kl": 0.01348876953125, "learning_rate": 1e-06, "loss": 0.1133, "num_tokens": 320601650.0, "reward": 1.215959906578064, "reward_std": 0.42418786883354187, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6936383843421936, "rewards/tag_count_reward/std": 0.3133472502231598, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1060.8951416015625, "completions/mean_terminated_length": 871.875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10058068296840872, "frac_reward_zero_std": 0.0, "grad_norm": 0.3041596854087582, "kl": 0.016754150390625, "learning_rate": 9.999998754797222e-07, "loss": 0.1243, "num_tokens": 321145795.0, "reward": 1.340959906578064, "reward_std": 0.45829442143440247, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7806919813156128, "rewards/tag_count_reward/std": 0.27228620648384094, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1109.046875, "completions/mean_terminated_length": 749.6944580078125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.10079377763571466, "frac_reward_zero_std": 0.0, "grad_norm": 0.13875230681146303, "kl": 0.0123748779296875, "learning_rate": 9.99999501918958e-07, "loss": 0.0762, "num_tokens": 321712344.0, "reward": 1.1188616752624512, "reward_std": 0.41439902782440186, "rewards/accuracy_reward/mean": 0.47685185074806213, "rewards/accuracy_reward/std": 0.5000429749488831, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6590401530265808, "rewards/tag_count_reward/std": 0.31766021251678467, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1181.046875, "completions/mean_terminated_length": 856.604248046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.10100687230302062, "frac_reward_zero_std": 0.0, "grad_norm": 0.1323363674377381, "kl": 0.01513671875, "learning_rate": 9.999988793179141e-07, "loss": 0.1139, "num_tokens": 322309965.0, "reward": 1.2170759439468384, "reward_std": 0.47222140431404114, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6969866156578064, "rewards/tag_count_reward/std": 0.32444560527801514, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1142.9866943359375, "completions/mean_terminated_length": 855.5117797851562, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.10121996697032656, "frac_reward_zero_std": 0.0, "grad_norm": 0.12387421477348093, "kl": 0.013519287109375, "learning_rate": 9.999980076769348e-07, "loss": 0.1511, "num_tokens": 322893591.0, "reward": 1.2059152126312256, "reward_std": 0.38405469059944153, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7059151530265808, "rewards/tag_count_reward/std": 0.3287724554538727, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 996.1563110351562, "completions/mean_terminated_length": 713.0821533203125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10143306163763252, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15765461278053922, "kl": 0.0155487060546875, "learning_rate": 9.999968869965026e-07, "loss": 0.1592, "num_tokens": 323409021.0, "reward": 1.20703125, "reward_std": 0.3859425485134125, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7159598469734192, "rewards/tag_count_reward/std": 0.31831979751586914, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1051.2232666015625, "completions/mean_terminated_length": 817.8181762695312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.10164615630493846, "frac_reward_zero_std": 0.0, "grad_norm": 0.13197322469146067, "kl": 0.0146484375, "learning_rate": 9.99995517277238e-07, "loss": 0.1098, "num_tokens": 323945057.0, "reward": 1.2494419813156128, "reward_std": 0.38459935784339905, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7516741156578064, "rewards/tag_count_reward/std": 0.28304529190063477, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1154.66748046875, "completions/mean_terminated_length": 831.547119140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.10185925097224442, "frac_reward_zero_std": 0.0, "grad_norm": 0.1326367157254861, "kl": 0.0151214599609375, "learning_rate": 9.999938985198985e-07, "loss": 0.1155, "num_tokens": 324530812.0, "reward": 1.1584821939468384, "reward_std": 0.4268447160720825, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6964285969734192, "rewards/tag_count_reward/std": 0.3131684362888336, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1078.15625, "completions/mean_terminated_length": 841.0833740234375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.10207234563955037, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13267499481677836, "kl": 0.0180511474609375, "learning_rate": 9.999920307253804e-07, "loss": 0.0967, "num_tokens": 325078066.0, "reward": 1.3738839626312256, "reward_std": 0.4221484959125519, "rewards/accuracy_reward/mean": 0.6319444179534912, "rewards/accuracy_reward/std": 0.48283571004867554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7645089030265808, "rewards/tag_count_reward/std": 0.31251198053359985, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1055.3660888671875, "completions/mean_terminated_length": 812.7222290039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.10228544030685632, "frac_reward_zero_std": 0.0, "grad_norm": 0.13259897750583752, "kl": 0.015777587890625, "learning_rate": 9.999899138947174e-07, "loss": 0.0998, "num_tokens": 325620182.0, "reward": 1.2059152126312256, "reward_std": 0.4361419975757599, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7327008843421936, "rewards/tag_count_reward/std": 0.298405259847641, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1241.21875, "completions/mean_terminated_length": 900.5778198242188, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10249853497416227, "frac_reward_zero_std": 0.0, "grad_norm": 0.11718697952367829, "kl": 0.01312255859375, "learning_rate": 9.999875480290809e-07, "loss": 0.1043, "num_tokens": 326246664.0, "reward": 1.172991156578064, "reward_std": 0.4758950173854828, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6886160969734192, "rewards/tag_count_reward/std": 0.3223131597042084, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 999.1920166015625, "completions/mean_terminated_length": 781.5148315429688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.10271162964146822, "frac_reward_zero_std": 0.0, "grad_norm": 0.13993229328415177, "kl": 0.016937255859375, "learning_rate": 9.999849331297799e-07, "loss": 0.1095, "num_tokens": 326764654.0, "reward": 1.3627232313156128, "reward_std": 0.3963104486465454, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7533482313156128, "rewards/tag_count_reward/std": 0.29958948493003845, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1074.4129638671875, "completions/mean_terminated_length": 822.8118286132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.10292472430877417, "frac_reward_zero_std": 0.0, "grad_norm": 0.41277423822743126, "kl": 0.037567138671875, "learning_rate": 9.99982069198262e-07, "loss": 0.1098, "num_tokens": 327313159.0, "reward": 1.2533482313156128, "reward_std": 0.4396021366119385, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.30418795347213745, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1064.6629638671875, "completions/mean_terminated_length": 732.9701538085938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.10313781897608013, "frac_reward_zero_std": 0.0, "grad_norm": 0.16347744786689283, "kl": 0.0197296142578125, "learning_rate": 9.99978956236112e-07, "loss": 0.1829, "num_tokens": 327856560.0, "reward": 1.1891741752624512, "reward_std": 0.3950643837451935, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7137276530265808, "rewards/tag_count_reward/std": 0.3189505934715271, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1045.34375, "completions/mean_terminated_length": 764.5999755859375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.10335091364338607, "frac_reward_zero_std": 0.0, "grad_norm": 0.4509870575701992, "kl": 0.048583984375, "learning_rate": 9.999755942450525e-07, "loss": 0.1119, "num_tokens": 328396330.0, "reward": 1.278459906578064, "reward_std": 0.41523391008377075, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7271205186843872, "rewards/tag_count_reward/std": 0.34091922640800476, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1137.279052734375, "completions/mean_terminated_length": 788.7315063476562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.10356400831069203, "frac_reward_zero_std": 0.0, "grad_norm": 0.1267572031800838, "kl": 0.0137481689453125, "learning_rate": 9.999719832269443e-07, "loss": 0.0779, "num_tokens": 328971063.0, "reward": 1.23828125, "reward_std": 0.4070073068141937, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7092633843421936, "rewards/tag_count_reward/std": 0.3270745277404785, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1166.1451416015625, "completions/mean_terminated_length": 828.6450805664062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.10377710297799797, "frac_reward_zero_std": 0.0, "grad_norm": 0.11671655775877127, "kl": 0.01446533203125, "learning_rate": 9.99968123183786e-07, "loss": 0.1079, "num_tokens": 329568664.0, "reward": 1.1194196939468384, "reward_std": 0.42910727858543396, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6908482313156128, "rewards/tag_count_reward/std": 0.3424888253211975, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 987.9152221679688, "completions/mean_terminated_length": 798.2158203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10399019764530393, "frac_reward_zero_std": 0.0, "grad_norm": 0.1431692286925934, "kl": 0.0169525146484375, "learning_rate": 9.999640141177135e-07, "loss": 0.1129, "num_tokens": 330083986.0, "reward": 1.313616156578064, "reward_std": 0.4264586567878723, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.2877199947834015, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1058.3795166015625, "completions/mean_terminated_length": 762.9275512695312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10420329231260987, "frac_reward_zero_std": 0.0, "grad_norm": 0.1453310242710074, "kl": 0.0164642333984375, "learning_rate": 9.999596560310011e-07, "loss": 0.1413, "num_tokens": 330627100.0, "reward": 1.2639509439468384, "reward_std": 0.4461327791213989, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7327008843421936, "rewards/tag_count_reward/std": 0.30808794498443604, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1297.5335693359375, "completions/mean_terminated_length": 880.607666015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10441638697991583, "frac_reward_zero_std": 0.0, "grad_norm": 0.11376023971570538, "kl": 0.0116119384765625, "learning_rate": 9.999550489260604e-07, "loss": 0.1491, "num_tokens": 331289371.0, "reward": 1.047991156578064, "reward_std": 0.45523765683174133, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6439732313156128, "rewards/tag_count_reward/std": 0.33682864904403687, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1158.3348388671875, "completions/mean_terminated_length": 851.0931396484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.10462948164722177, "frac_reward_zero_std": 0.0, "grad_norm": 0.13143303938856157, "kl": 0.01300048828125, "learning_rate": 9.999501928054414e-07, "loss": 0.1293, "num_tokens": 331879953.0, "reward": 1.180803656578064, "reward_std": 0.49052533507347107, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6964285969734192, "rewards/tag_count_reward/std": 0.3219740688800812, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1104.97998046875, "completions/mean_terminated_length": 715.277587890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.10484257631452773, "frac_reward_zero_std": 0.0, "grad_norm": 0.12751031398177987, "kl": 0.0137939453125, "learning_rate": 9.999450876718313e-07, "loss": 0.1371, "num_tokens": 332437768.0, "reward": 1.2064732313156128, "reward_std": 0.4385809898376465, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6930803656578064, "rewards/tag_count_reward/std": 0.3603644073009491, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1027.2410888671875, "completions/mean_terminated_length": 777.7222290039062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10505567098183367, "frac_reward_zero_std": 0.0, "grad_norm": 0.14254317945035394, "kl": 0.0149688720703125, "learning_rate": 9.999397335280558e-07, "loss": 0.1304, "num_tokens": 332967540.0, "reward": 1.1121652126312256, "reward_std": 0.3940357565879822, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7237723469734192, "rewards/tag_count_reward/std": 0.29586759209632874, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1213.1875, "completions/mean_terminated_length": 928.2515258789062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.10526876564913963, "frac_reward_zero_std": 0.0, "grad_norm": 0.1218325779123523, "kl": 0.0144805908203125, "learning_rate": 9.999341303770773e-07, "loss": 0.1287, "num_tokens": 333573464.0, "reward": 1.21484375, "reward_std": 0.48583367466926575, "rewards/accuracy_reward/mean": 0.5115740895271301, "rewards/accuracy_reward/std": 0.5004456043243408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7215401530265808, "rewards/tag_count_reward/std": 0.32709363102912903, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1063.43310546875, "completions/mean_terminated_length": 769.4898681640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.10548186031644557, "frac_reward_zero_std": 0.0, "grad_norm": 0.14470465965765336, "kl": 0.0158233642578125, "learning_rate": 9.999282782219976e-07, "loss": 0.1564, "num_tokens": 334120170.0, "reward": 1.223772406578064, "reward_std": 0.4154900312423706, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7126116156578064, "rewards/tag_count_reward/std": 0.3035444915294647, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1091.078125, "completions/mean_terminated_length": 779.6538696289062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.10569495498375153, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272113650096219, "kl": 0.01385498046875, "learning_rate": 9.999221770660548e-07, "loss": 0.1356, "num_tokens": 334676637.0, "reward": 1.1941964626312256, "reward_std": 0.4048629105091095, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6852678656578064, "rewards/tag_count_reward/std": 0.31639668345451355, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1254.571533203125, "completions/mean_terminated_length": 937.2000122070312, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10590804965105748, "frac_reward_zero_std": 0.0, "grad_norm": 0.11722924563048634, "kl": 0.01226806640625, "learning_rate": 9.999158269126255e-07, "loss": 0.1144, "num_tokens": 335307677.0, "reward": 1.176897406578064, "reward_std": 0.478923499584198, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6746651530265808, "rewards/tag_count_reward/std": 0.34399595856666565, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1258.993408203125, "completions/mean_terminated_length": 877.552978515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10612114431836343, "frac_reward_zero_std": 0.0, "grad_norm": 0.11749255523740923, "kl": 0.013092041015625, "learning_rate": 9.999092277652242e-07, "loss": 0.1208, "num_tokens": 335935226.0, "reward": 1.1875, "reward_std": 0.3897707164287567, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6941964030265808, "rewards/tag_count_reward/std": 0.3523011803627014, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1086.515625, "completions/mean_terminated_length": 813.773681640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.10633423898566938, "frac_reward_zero_std": 0.0, "grad_norm": 0.13820285403362007, "kl": 0.014739990234375, "learning_rate": 9.99902379627503e-07, "loss": 0.151, "num_tokens": 336490241.0, "reward": 1.2142857313156128, "reward_std": 0.42686742544174194, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.3142428994178772, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1049.884033203125, "completions/mean_terminated_length": 795.4622192382812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10654733365297533, "frac_reward_zero_std": 0.0, "grad_norm": 0.38228978955619164, "kl": 0.0238800048828125, "learning_rate": 9.99895282503252e-07, "loss": 0.1193, "num_tokens": 337027773.0, "reward": 1.1908482313156128, "reward_std": 0.44005751609802246, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7310267686843872, "rewards/tag_count_reward/std": 0.2952408790588379, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1009.3192138671875, "completions/mean_terminated_length": 714.6791381835938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.10676042832028128, "frac_reward_zero_std": 0.0, "grad_norm": 0.15334362466840795, "kl": 0.015472412109375, "learning_rate": 9.998879363963983e-07, "loss": 0.2147, "num_tokens": 337549964.0, "reward": 1.2622768878936768, "reward_std": 0.44516825675964355, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7310267686843872, "rewards/tag_count_reward/std": 0.3059394359588623, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1176.57373046875, "completions/mean_terminated_length": 824.1786499023438, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.10697352298758724, "frac_reward_zero_std": 0.0, "grad_norm": 0.15620953685457023, "kl": 0.0352935791015625, "learning_rate": 9.99880341311008e-07, "loss": 0.1333, "num_tokens": 338151773.0, "reward": 1.1629464626312256, "reward_std": 0.38759276270866394, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6941964030265808, "rewards/tag_count_reward/std": 0.3368823826313019, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1075.3482666015625, "completions/mean_terminated_length": 711.3496704101562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1071866176548932, "frac_reward_zero_std": 0.0, "grad_norm": 0.13664039744713277, "kl": 0.01177978515625, "learning_rate": 9.998724972512838e-07, "loss": 0.1406, "num_tokens": 338705417.0, "reward": 0.9782366752624512, "reward_std": 0.34112757444381714, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6501116156578064, "rewards/tag_count_reward/std": 0.3228525221347809, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1134.024658203125, "completions/mean_terminated_length": 822.0689086914062, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.10739971232219914, "frac_reward_zero_std": 0.0, "grad_norm": 0.11694709474631049, "kl": 0.0136566162109375, "learning_rate": 9.998644042215675e-07, "loss": 0.1044, "num_tokens": 339293860.0, "reward": 1.07421875, "reward_std": 0.3680979609489441, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6791294813156128, "rewards/tag_count_reward/std": 0.3392232060432434, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1118.4910888671875, "completions/mean_terminated_length": 804.9552001953125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1076128069895051, "frac_reward_zero_std": 0.0, "grad_norm": 0.11700136495266426, "kl": 0.01434326171875, "learning_rate": 9.998560622263376e-07, "loss": 0.0674, "num_tokens": 339860240.0, "reward": 1.2890625, "reward_std": 0.41393178701400757, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7533482313156128, "rewards/tag_count_reward/std": 0.30696600675582886, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1138.8013916015625, "completions/mean_terminated_length": 817.4229736328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10782590165681104, "frac_reward_zero_std": 0.0, "grad_norm": 0.12451529046004653, "kl": 0.0135040283203125, "learning_rate": 9.998474712702108e-07, "loss": 0.1283, "num_tokens": 340444151.0, "reward": 1.2790179252624512, "reward_std": 0.40313443541526794, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.31778252124786377, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 999.482177734375, "completions/mean_terminated_length": 721.0621337890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.108038996324117, "frac_reward_zero_std": 0.0, "grad_norm": 0.15103539730126336, "kl": 0.0154266357421875, "learning_rate": 9.998386313579417e-07, "loss": 0.1696, "num_tokens": 340967183.0, "reward": 1.1227679252624512, "reward_std": 0.3666459619998932, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7142857313156128, "rewards/tag_count_reward/std": 0.31124910712242126, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1067.953125, "completions/mean_terminated_length": 771.6598510742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.10825209099142294, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.8341277634904317, "kl": 0.02423095703125, "learning_rate": 9.998295424944222e-07, "loss": 0.1571, "num_tokens": 341517898.0, "reward": 1.1540179252624512, "reward_std": 0.3726150095462799, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6941964030265808, "rewards/tag_count_reward/std": 0.32159388065338135, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1076.337158203125, "completions/mean_terminated_length": 771.4457397460938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1084651856587289, "frac_reward_zero_std": 0.0, "grad_norm": 0.12928863970447124, "kl": 0.01434326171875, "learning_rate": 9.998202046846825e-07, "loss": 0.1443, "num_tokens": 342068881.0, "reward": 1.2008929252624512, "reward_std": 0.4575641453266144, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7209821343421936, "rewards/tag_count_reward/std": 0.3268306255340576, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1073.622802734375, "completions/mean_terminated_length": 811.3966064453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.10867828032603484, "frac_reward_zero_std": 0.0, "grad_norm": 0.14403471799927503, "kl": 0.015899658203125, "learning_rate": 9.998106179338903e-07, "loss": 0.1677, "num_tokens": 342620728.0, "reward": 1.2935268878936768, "reward_std": 0.44815048575401306, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.2971627712249756, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1084.7857666015625, "completions/mean_terminated_length": 744.314208984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1088913749933408, "frac_reward_zero_std": 0.0, "grad_norm": 0.13609437807517724, "kl": 0.0151519775390625, "learning_rate": 9.99800782247351e-07, "loss": 0.1243, "num_tokens": 343173864.0, "reward": 1.1752232313156128, "reward_std": 0.4100801646709442, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7176339030265808, "rewards/tag_count_reward/std": 0.3147410750389099, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1065.029052734375, "completions/mean_terminated_length": 821.33984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.10910446966064674, "frac_reward_zero_std": 0.0, "grad_norm": 0.12795470144425597, "kl": 0.0147247314453125, "learning_rate": 9.997906976305082e-07, "loss": 0.1238, "num_tokens": 343717493.0, "reward": 1.2215402126312256, "reward_std": 0.4134916663169861, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7572544813156128, "rewards/tag_count_reward/std": 0.29267311096191406, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1140.0826416015625, "completions/mean_terminated_length": 796.4707641601562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1093175643279527, "frac_reward_zero_std": 0.0, "grad_norm": 0.12921375300348603, "kl": 0.0121002197265625, "learning_rate": 9.997803640889428e-07, "loss": 0.0828, "num_tokens": 344305290.0, "reward": 1.1049107313156128, "reward_std": 0.412174791097641, "rewards/accuracy_reward/mean": 0.39814814925193787, "rewards/accuracy_reward/std": 0.49008384346961975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7209821343421936, "rewards/tag_count_reward/std": 0.31194621324539185, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1014.450927734375, "completions/mean_terminated_length": 754.6201171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.10953065899525864, "frac_reward_zero_std": 0.0, "grad_norm": 0.14095531891109297, "kl": 0.016845703125, "learning_rate": 9.997697816283734e-07, "loss": 0.1177, "num_tokens": 344837860.0, "reward": 1.3264509439468384, "reward_std": 0.45388615131378174, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7728794813156128, "rewards/tag_count_reward/std": 0.28849291801452637, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1111.3504638671875, "completions/mean_terminated_length": 817.4457397460938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1097437536625646, "frac_reward_zero_std": 0.0, "grad_norm": 0.13073510604903013, "kl": 0.0164642333984375, "learning_rate": 9.997589502546572e-07, "loss": 0.1327, "num_tokens": 345403857.0, "reward": 1.2645089626312256, "reward_std": 0.3639766275882721, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7533482313156128, "rewards/tag_count_reward/std": 0.3105885982513428, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1125.419677734375, "completions/mean_terminated_length": 810.5269775390625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.10995684832987054, "frac_reward_zero_std": 0.0, "grad_norm": 0.12461563808646403, "kl": 0.0165557861328125, "learning_rate": 9.997478699737879e-07, "loss": 0.1137, "num_tokens": 345979725.0, "reward": 1.2477679252624512, "reward_std": 0.3813445270061493, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7321428656578064, "rewards/tag_count_reward/std": 0.34019383788108826, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1050.01123046875, "completions/mean_terminated_length": 755.8063354492188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1101699429971765, "frac_reward_zero_std": 0.0, "grad_norm": 0.14330925751302245, "kl": 0.01873779296875, "learning_rate": 9.997365407918978e-07, "loss": 0.1196, "num_tokens": 346518402.0, "reward": 1.1930804252624512, "reward_std": 0.4103604555130005, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.30229419469833374, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1137.90185546875, "completions/mean_terminated_length": 886.3931274414062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.11038303766448244, "frac_reward_zero_std": 0.0, "grad_norm": 0.13847372616222797, "kl": 0.01666259765625, "learning_rate": 9.99724962715257e-07, "loss": 0.0695, "num_tokens": 347099766.0, "reward": 1.3543527126312256, "reward_std": 0.41691386699676514, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.3035238981246948, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1168.560302734375, "completions/mean_terminated_length": 868.3922729492188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1105961323317884, "frac_reward_zero_std": 0.0, "grad_norm": 0.1235694863220792, "kl": 0.0141143798828125, "learning_rate": 9.997131357502726e-07, "loss": 0.0638, "num_tokens": 347693329.0, "reward": 1.1422991752624512, "reward_std": 0.4199191629886627, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7003348469734192, "rewards/tag_count_reward/std": 0.3215157389640808, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1343.1875, "completions/mean_terminated_length": 895.6058349609375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.11080922699909435, "frac_reward_zero_std": 0.0, "grad_norm": 0.11935521466152466, "kl": 0.013671875, "learning_rate": 9.9970105990349e-07, "loss": 0.1195, "num_tokens": 348368373.0, "reward": 0.9793527126312256, "reward_std": 0.4353788495063782, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6422991156578064, "rewards/tag_count_reward/std": 0.35628247261047363, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 984.075927734375, "completions/mean_terminated_length": 716.6089477539062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1110223216664003, "frac_reward_zero_std": 0.0, "grad_norm": 18.379072375856186, "kl": 0.035186767578125, "learning_rate": 9.99688735181593e-07, "loss": 0.1555, "num_tokens": 348881271.0, "reward": 1.1266741752624512, "reward_std": 0.3916095197200775, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6869419813156128, "rewards/tag_count_reward/std": 0.3260883092880249, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1219.0179443359375, "completions/mean_terminated_length": 826.3421020507812, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.11123541633370625, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16346793855284342, "kl": 0.0158233642578125, "learning_rate": 9.996761615914013e-07, "loss": 0.1993, "num_tokens": 349499199.0, "reward": 1.0379464626312256, "reward_std": 0.4653177857398987, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6651785969734192, "rewards/tag_count_reward/std": 0.34685155749320984, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1081.5625, "completions/mean_terminated_length": 782.0233764648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1114485110010122, "frac_reward_zero_std": 0.0, "grad_norm": 0.15460437359577386, "kl": 0.0168609619140625, "learning_rate": 9.996633391398742e-07, "loss": 0.1182, "num_tokens": 350059067.0, "reward": 1.3203125, "reward_std": 0.43013155460357666, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7488839030265808, "rewards/tag_count_reward/std": 0.33066415786743164, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1116.5960693359375, "completions/mean_terminated_length": 859.1994018554688, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.11166160566831815, "frac_reward_zero_std": 0.0, "grad_norm": 0.14815642598142983, "kl": 0.0152740478515625, "learning_rate": 9.996502678341075e-07, "loss": 0.0672, "num_tokens": 350627926.0, "reward": 1.313616156578064, "reward_std": 0.4055522084236145, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.31701749563217163, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1168.4107666015625, "completions/mean_terminated_length": 824.2236328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1118747003356241, "frac_reward_zero_std": 0.0, "grad_norm": 0.12272438572086954, "kl": 0.0148468017578125, "learning_rate": 9.996369476813355e-07, "loss": 0.1123, "num_tokens": 351219918.0, "reward": 1.2209821939468384, "reward_std": 0.477637380361557, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7321428656578064, "rewards/tag_count_reward/std": 0.3167831301689148, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1072.216552734375, "completions/mean_terminated_length": 872.8629150390625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.11208779500293005, "frac_reward_zero_std": 0.0, "grad_norm": 0.12197009621679969, "kl": 0.0158538818359375, "learning_rate": 9.996233786889298e-07, "loss": 0.0793, "num_tokens": 351767055.0, "reward": 1.2918527126312256, "reward_std": 0.4216061532497406, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.7583705186843872, "rewards/tag_count_reward/std": 0.3034334182739258, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1204.357177734375, "completions/mean_terminated_length": 929.798828125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.112300889670236, "frac_reward_zero_std": 0.0, "grad_norm": 0.27005150137523415, "kl": 0.01641845703125, "learning_rate": 9.996095608643995e-07, "loss": 0.1296, "num_tokens": 352374495.0, "reward": 1.1422991752624512, "reward_std": 0.44155457615852356, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7137276530265808, "rewards/tag_count_reward/std": 0.3198261260986328, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1173.34375, "completions/mean_terminated_length": 856.978759765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.11251398433754195, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260793117385092, "kl": 0.0139923095703125, "learning_rate": 9.99595494215392e-07, "loss": 0.118, "num_tokens": 352966697.0, "reward": 1.0385044813156128, "reward_std": 0.41826698184013367, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6791294813156128, "rewards/tag_count_reward/std": 0.3296067714691162, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 995.8839721679688, "completions/mean_terminated_length": 842.5064086914062, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.11272707900484791, "frac_reward_zero_std": 0.0, "grad_norm": 0.13190721492863094, "kl": 0.01690673828125, "learning_rate": 9.995811787496922e-07, "loss": 0.0966, "num_tokens": 353477653.0, "reward": 1.3588169813156128, "reward_std": 0.3607838749885559, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7873883843421936, "rewards/tag_count_reward/std": 0.2729867994785309, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1087.779052734375, "completions/mean_terminated_length": 804.7080688476562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.11294017367215385, "frac_reward_zero_std": 0.0, "grad_norm": 0.15513465383177902, "kl": 0.0165252685546875, "learning_rate": 9.995666144752225e-07, "loss": 0.1708, "num_tokens": 354035266.0, "reward": 1.2075893878936768, "reward_std": 0.37594375014305115, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7120535969734192, "rewards/tag_count_reward/std": 0.3185359239578247, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1130.325927734375, "completions/mean_terminated_length": 876.7236328125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.11315326833945981, "frac_reward_zero_std": 0.0, "grad_norm": 0.12860773206909873, "kl": 0.01416015625, "learning_rate": 9.99551801400043e-07, "loss": 0.1314, "num_tokens": 354615556.0, "reward": 1.2767857313156128, "reward_std": 0.4374081492424011, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.31750741600990295, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1130.212158203125, "completions/mean_terminated_length": 775.0309448242188, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.11336636300676575, "frac_reward_zero_std": 0.0, "grad_norm": 0.13410960701045807, "kl": 0.014862060546875, "learning_rate": 9.995367395323516e-07, "loss": 0.1359, "num_tokens": 355181219.0, "reward": 1.157366156578064, "reward_std": 0.3704836964607239, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6908482313156128, "rewards/tag_count_reward/std": 0.32703492045402527, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1196.58935546875, "completions/mean_terminated_length": 817.5741577148438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11357945767407171, "frac_reward_zero_std": 0.0, "grad_norm": 0.11653091325140572, "kl": 0.0120086669921875, "learning_rate": 9.995214288804841e-07, "loss": 0.12, "num_tokens": 355784155.0, "reward": 1.1171875, "reward_std": 0.4092658758163452, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6729910969734192, "rewards/tag_count_reward/std": 0.34178823232650757, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1075.0692138671875, "completions/mean_terminated_length": 879.439697265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.11379255234137765, "frac_reward_zero_std": 0.0, "grad_norm": 0.12166022349041769, "kl": 0.0179443359375, "learning_rate": 9.995058694529135e-07, "loss": 0.1029, "num_tokens": 356333994.0, "reward": 1.4642857313156128, "reward_std": 0.44063979387283325, "rewards/accuracy_reward/mean": 0.6897321343421936, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7745535969734192, "rewards/tag_count_reward/std": 0.2953021824359894, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1121.265625, "completions/mean_terminated_length": 919.8016357421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.11400564700868361, "frac_reward_zero_std": 0.0, "grad_norm": 1.7425105937638388, "kl": 0.092041015625, "learning_rate": 9.99490061258251e-07, "loss": 0.1163, "num_tokens": 356912545.0, "reward": 1.329241156578064, "reward_std": 0.47690925002098083, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7198660969734192, "rewards/tag_count_reward/std": 0.3158497214317322, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1282.638427734375, "completions/mean_terminated_length": 927.4705810546875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.11421874167598955, "frac_reward_zero_std": 0.0, "grad_norm": 0.1333996275925524, "kl": 0.0131988525390625, "learning_rate": 9.994740043052451e-07, "loss": 0.0849, "num_tokens": 357565855.0, "reward": 1.040178656578064, "reward_std": 0.40426236391067505, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.33617010712623596, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1285.852783203125, "completions/mean_terminated_length": 928.5180053710938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.11443183634329551, "frac_reward_zero_std": 0.0, "grad_norm": 0.12787905799468002, "kl": 0.0139007568359375, "learning_rate": 9.99457698602782e-07, "loss": 0.141, "num_tokens": 358212349.0, "reward": 1.1077009439468384, "reward_std": 0.4428555369377136, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.6746651530265808, "rewards/tag_count_reward/std": 0.3332604467868805, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 994.2835083007812, "completions/mean_terminated_length": 792.5079345703125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11464493101060146, "frac_reward_zero_std": 0.0, "grad_norm": 0.14556751045745503, "kl": 0.0163421630859375, "learning_rate": 9.994411441598858e-07, "loss": 0.1171, "num_tokens": 358726508.0, "reward": 1.2075893878936768, "reward_std": 0.39020565152168274, "rewards/accuracy_reward/mean": 0.4791666567325592, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7455357313156128, "rewards/tag_count_reward/std": 0.3000412583351135, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1054.85498046875, "completions/mean_terminated_length": 861.5226440429688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11485802567790741, "frac_reward_zero_std": 0.0, "grad_norm": 0.13246574499671018, "kl": 0.0164947509765625, "learning_rate": 9.994243409857184e-07, "loss": 0.1355, "num_tokens": 359273643.0, "reward": 1.1802456378936768, "reward_std": 0.37442320585250854, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.7315848469734192, "rewards/tag_count_reward/std": 0.2902616858482361, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1084.305908203125, "completions/mean_terminated_length": 792.9564208984375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11507112034521336, "frac_reward_zero_std": 0.0, "grad_norm": 0.14541726033840127, "kl": 0.0151214599609375, "learning_rate": 9.994072890895786e-07, "loss": 0.1289, "num_tokens": 359832708.0, "reward": 1.0943081378936768, "reward_std": 0.4312390685081482, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6947544813156128, "rewards/tag_count_reward/std": 0.3253639042377472, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1069.6116943359375, "completions/mean_terminated_length": 802.7784423828125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11528421501251931, "frac_reward_zero_std": 0.0, "grad_norm": 0.13979966313418196, "kl": 0.0152130126953125, "learning_rate": 9.993899884809032e-07, "loss": 0.1432, "num_tokens": 360374870.0, "reward": 1.1389509439468384, "reward_std": 0.41030871868133545, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6880580186843872, "rewards/tag_count_reward/std": 0.32285642623901367, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1106.446533203125, "completions/mean_terminated_length": 788.8477783203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.11549730967982526, "frac_reward_zero_std": 0.0, "grad_norm": 0.22971373969713504, "kl": 0.018524169921875, "learning_rate": 9.993724391692675e-07, "loss": 0.1045, "num_tokens": 360934878.0, "reward": 1.2349331378936768, "reward_std": 0.3717692792415619, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.32729583978652954, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1030.2054443359375, "completions/mean_terminated_length": 767.1798095703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.11571040434713122, "frac_reward_zero_std": 0.0, "grad_norm": 0.1429312777871901, "kl": 0.02386474609375, "learning_rate": 9.993546411643828e-07, "loss": 0.1664, "num_tokens": 361466442.0, "reward": 1.2912946939468384, "reward_std": 0.36561089754104614, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7243303656578064, "rewards/tag_count_reward/std": 0.309089720249176, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1215.1138916015625, "completions/mean_terminated_length": 881.9594116210938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.11592349901443716, "frac_reward_zero_std": 0.0, "grad_norm": 0.12404947796649639, "kl": 0.013031005859375, "learning_rate": 9.993365944760997e-07, "loss": 0.1157, "num_tokens": 362085661.0, "reward": 1.15234375, "reward_std": 0.4340263307094574, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.33053719997406006, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1044.35498046875, "completions/mean_terminated_length": 792.0418701171875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.11613659368174312, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293230117473075, "kl": 0.015289306640625, "learning_rate": 9.993182991144052e-07, "loss": 0.1189, "num_tokens": 362618428.0, "reward": 1.2516741752624512, "reward_std": 0.40109357237815857, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.30781227350234985, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1100.3192138671875, "completions/mean_terminated_length": 865.3788452148438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11634968834904906, "frac_reward_zero_std": 0.0, "grad_norm": 0.1279928632691155, "kl": 0.0162200927734375, "learning_rate": 9.992997550894246e-07, "loss": 0.1052, "num_tokens": 363177515.0, "reward": 1.266741156578064, "reward_std": 0.40209218859672546, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7176339030265808, "rewards/tag_count_reward/std": 0.31783556938171387, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1085.5357666015625, "completions/mean_terminated_length": 843.5753784179688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.11656278301635502, "frac_reward_zero_std": 0.0, "grad_norm": 0.1333956759794662, "kl": 0.01458740234375, "learning_rate": 9.992809624114205e-07, "loss": 0.1045, "num_tokens": 363726811.0, "reward": 1.2427456378936768, "reward_std": 0.4335285723209381, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7137276530265808, "rewards/tag_count_reward/std": 0.30734148621559143, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1170.7835693359375, "completions/mean_terminated_length": 823.7227172851562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.11677587768366096, "frac_reward_zero_std": 0.0, "grad_norm": 0.12050974634552532, "kl": 0.014495849609375, "learning_rate": 9.992619210907934e-07, "loss": 0.1517, "num_tokens": 364320474.0, "reward": 1.204241156578064, "reward_std": 0.4439477324485779, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6752232313156128, "rewards/tag_count_reward/std": 0.3255350887775421, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1095.7545166015625, "completions/mean_terminated_length": 885.5858154296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.11698897235096692, "frac_reward_zero_std": 0.0, "grad_norm": 0.11787082270924935, "kl": 0.013092041015625, "learning_rate": 9.992426311380808e-07, "loss": 0.1024, "num_tokens": 364884220.0, "reward": 1.1590402126312256, "reward_std": 0.4144341051578522, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7215401530265808, "rewards/tag_count_reward/std": 0.30725616216659546, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1016.9375610351562, "completions/mean_terminated_length": 754.11767578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.11720206701827286, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14411149970135298, "kl": 0.0152435302734375, "learning_rate": 9.992230925639584e-07, "loss": 0.1037, "num_tokens": 365408256.0, "reward": 1.2248884439468384, "reward_std": 0.3508469760417938, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7248883843421936, "rewards/tag_count_reward/std": 0.28193163871765137, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1160.055908203125, "completions/mean_terminated_length": 827.7576293945312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.11741516168557882, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260065380952364, "kl": 0.0141754150390625, "learning_rate": 9.992033053792397e-07, "loss": 0.1072, "num_tokens": 366000169.0, "reward": 1.05859375, "reward_std": 0.4481099247932434, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6858258843421936, "rewards/tag_count_reward/std": 0.32111555337905884, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1118.296875, "completions/mean_terminated_length": 797.228271484375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.11762825635288476, "frac_reward_zero_std": 0.0, "grad_norm": 0.12346759248171382, "kl": 0.0135345458984375, "learning_rate": 9.991832695948747e-07, "loss": 0.117, "num_tokens": 366566158.0, "reward": 1.0814732313156128, "reward_std": 0.3837091326713562, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.32781273126602173, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1055.450927734375, "completions/mean_terminated_length": 812.8278198242188, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.11784135102019072, "frac_reward_zero_std": 0.0, "grad_norm": 0.14131325662348282, "kl": 0.0155487060546875, "learning_rate": 9.991629852219523e-07, "loss": 0.128, "num_tokens": 367104088.0, "reward": 1.1489956378936768, "reward_std": 0.4467398226261139, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.308691143989563, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 884.82373046875, "completions/mean_terminated_length": 745.2424926757812, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.11805444568749666, "frac_reward_zero_std": 0.0, "grad_norm": 0.25119218626675094, "kl": 0.02606201171875, "learning_rate": 9.991424522716978e-07, "loss": 0.0535, "num_tokens": 367568553.0, "reward": 1.3816964626312256, "reward_std": 0.34707847237586975, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7857142686843872, "rewards/tag_count_reward/std": 0.27448779344558716, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1129.6273193359375, "completions/mean_terminated_length": 869.1146240234375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.11826754035480262, "frac_reward_zero_std": 0.0, "grad_norm": 0.11701143350943649, "kl": 0.01519775390625, "learning_rate": 9.99121670755475e-07, "loss": 0.1171, "num_tokens": 368145570.0, "reward": 1.2075893878936768, "reward_std": 0.46509990096092224, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7142857313156128, "rewards/tag_count_reward/std": 0.31879445910453796, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1064.529052734375, "completions/mean_terminated_length": 857.2026977539062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.11848063502210857, "frac_reward_zero_std": 0.0, "grad_norm": 0.12077757181883189, "kl": 0.0151519775390625, "learning_rate": 9.99100640684785e-07, "loss": 0.0744, "num_tokens": 368693471.0, "reward": 1.1813616752624512, "reward_std": 0.4389062821865082, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7349330186843872, "rewards/tag_count_reward/std": 0.2856007516384125, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1063.118408203125, "completions/mean_terminated_length": 842.4617309570312, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.11869372968941452, "frac_reward_zero_std": 0.0, "grad_norm": 0.13249580267094713, "kl": 0.01611328125, "learning_rate": 9.990793620712657e-07, "loss": 0.0958, "num_tokens": 369240996.0, "reward": 1.2924107313156128, "reward_std": 0.4899755120277405, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.297794371843338, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1157.19873046875, "completions/mean_terminated_length": 796.9686279296875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.11890682435672047, "frac_reward_zero_std": 0.0, "grad_norm": 0.13194843495605557, "kl": 0.014251708984375, "learning_rate": 9.990578349266939e-07, "loss": 0.0993, "num_tokens": 369834253.0, "reward": 1.0853794813156128, "reward_std": 0.3956058919429779, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.31494081020355225, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1127.8951416015625, "completions/mean_terminated_length": 853.1971435546875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.11911991902402642, "frac_reward_zero_std": 0.0, "grad_norm": 0.12547571916045897, "kl": 0.015045166015625, "learning_rate": 9.990360592629827e-07, "loss": 0.1011, "num_tokens": 370403822.0, "reward": 1.2109375, "reward_std": 0.4083310663700104, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7310267686843872, "rewards/tag_count_reward/std": 0.314503014087677, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 993.19873046875, "completions/mean_terminated_length": 728.025146484375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.11933301369133237, "frac_reward_zero_std": 0.0, "grad_norm": 0.14974636872820007, "kl": 0.01580810546875, "learning_rate": 9.990140350921837e-07, "loss": 0.0939, "num_tokens": 370913143.0, "reward": 1.231584906578064, "reward_std": 0.3409159779548645, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7137276530265808, "rewards/tag_count_reward/std": 0.2846681475639343, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1152.732177734375, "completions/mean_terminated_length": 868.3529663085938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.11954610835863833, "frac_reward_zero_std": 0.0, "grad_norm": 0.129940523584444, "kl": 0.0146484375, "learning_rate": 9.989917624264854e-07, "loss": 0.115, "num_tokens": 371495935.0, "reward": 1.0859375, "reward_std": 0.4487876892089844, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6930803656578064, "rewards/tag_count_reward/std": 0.29031166434288025, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1071.341552734375, "completions/mean_terminated_length": 822.389404296875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.11975920302594427, "frac_reward_zero_std": 0.0, "grad_norm": 0.14072939191506276, "kl": 0.013671875, "learning_rate": 9.989692412782137e-07, "loss": 0.1141, "num_tokens": 372048632.0, "reward": 1.1400669813156128, "reward_std": 0.43616780638694763, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6958705186843872, "rewards/tag_count_reward/std": 0.3065280020236969, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1151.950927734375, "completions/mean_terminated_length": 816.61962890625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.11997229769325023, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13449460635395685, "kl": 0.01605224609375, "learning_rate": 9.989464716598327e-07, "loss": 0.1255, "num_tokens": 372628130.0, "reward": 1.2639509439468384, "reward_std": 0.4298105239868164, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6947544813156128, "rewards/tag_count_reward/std": 0.33924898505210876, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1065.118408203125, "completions/mean_terminated_length": 800.6033935546875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.12018539236055617, "frac_reward_zero_std": 0.0, "grad_norm": 0.13335179386134763, "kl": 0.0149993896484375, "learning_rate": 9.989234535839436e-07, "loss": 0.1432, "num_tokens": 373182423.0, "reward": 1.2114956378936768, "reward_std": 0.4202878773212433, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.30598992109298706, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1239.9598388671875, "completions/mean_terminated_length": 967.3970336914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.12039848702786213, "frac_reward_zero_std": 0.0, "grad_norm": 0.11739770586083305, "kl": 0.0137481689453125, "learning_rate": 9.989001870632852e-07, "loss": 0.0446, "num_tokens": 373805221.0, "reward": 1.2516741752624512, "reward_std": 0.41214022040367126, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7159598469734192, "rewards/tag_count_reward/std": 0.3291178047657013, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1156.4888916015625, "completions/mean_terminated_length": 855.7701416015625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.12061158169516809, "frac_reward_zero_std": 0.0, "grad_norm": 0.13006548620854622, "kl": 0.0133819580078125, "learning_rate": 9.988766721107336e-07, "loss": 0.1195, "num_tokens": 374397952.0, "reward": 1.079241156578064, "reward_std": 0.39850112795829773, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6997767686843872, "rewards/tag_count_reward/std": 0.31505823135375977, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1090.84375, "completions/mean_terminated_length": 805.0841064453125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.12082467636247403, "frac_reward_zero_std": 0.0, "grad_norm": 0.14112731925731012, "kl": 0.015106201171875, "learning_rate": 9.988529087393026e-07, "loss": 0.1089, "num_tokens": 374963338.0, "reward": 1.157366156578064, "reward_std": 0.38730138540267944, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6975446343421936, "rewards/tag_count_reward/std": 0.307956725358963, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1008.2053833007812, "completions/mean_terminated_length": 709.413818359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.12103777102977999, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12950012178344686, "kl": 0.0142669677734375, "learning_rate": 9.988288969621433e-07, "loss": 0.167, "num_tokens": 375482374.0, "reward": 1.1099331378936768, "reward_std": 0.3620772957801819, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6947544813156128, "rewards/tag_count_reward/std": 0.299870103597641, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 963.997802734375, "completions/mean_terminated_length": 799.5861206054688, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.12125086569708593, "frac_reward_zero_std": 0.0, "grad_norm": 0.14305398855083126, "kl": 0.016510009765625, "learning_rate": 9.988046367925445e-07, "loss": 0.0634, "num_tokens": 375986437.0, "reward": 1.3404018878936768, "reward_std": 0.35928723216056824, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7622767686843872, "rewards/tag_count_reward/std": 0.2667539417743683, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1238.15185546875, "completions/mean_terminated_length": 892.5477905273438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.12146396036439189, "frac_reward_zero_std": 0.0, "grad_norm": 0.11384838146082502, "kl": 0.014556884765625, "learning_rate": 9.987801282439321e-07, "loss": 0.1299, "num_tokens": 376613305.0, "reward": 1.1780134439468384, "reward_std": 0.4377499520778656, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6891741156578064, "rewards/tag_count_reward/std": 0.3303444981575012, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1012.30810546875, "completions/mean_terminated_length": 842.8311767578125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.12167705503169783, "frac_reward_zero_std": 0.0, "grad_norm": 0.14804857877856392, "kl": 0.0182952880859375, "learning_rate": 9.987553713298703e-07, "loss": 0.1462, "num_tokens": 377133331.0, "reward": 1.4291294813156128, "reward_std": 0.40965503454208374, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7996651530265808, "rewards/tag_count_reward/std": 0.27764368057250977, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1148.6875, "completions/mean_terminated_length": 827.1151123046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.12189014969900379, "frac_reward_zero_std": 0.0, "grad_norm": 0.1202158473366826, "kl": 0.015350341796875, "learning_rate": 9.987303660640595e-07, "loss": 0.1102, "num_tokens": 377724391.0, "reward": 1.3510044813156128, "reward_std": 0.457707017660141, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7438616156578064, "rewards/tag_count_reward/std": 0.31523799896240234, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 956.9888916015625, "completions/mean_terminated_length": 761.7553100585938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.12210324436630973, "frac_reward_zero_std": 0.0, "grad_norm": 0.13732163923228674, "kl": 0.01849365234375, "learning_rate": 9.987051124603385e-07, "loss": 0.0914, "num_tokens": 378218226.0, "reward": 1.4330357313156128, "reward_std": 0.370051771402359, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7946428656578064, "rewards/tag_count_reward/std": 0.2752145528793335, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1158.962158203125, "completions/mean_terminated_length": 910.0314331054688, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.12231633903361569, "frac_reward_zero_std": 0.0, "grad_norm": 0.11676670658941553, "kl": 0.014923095703125, "learning_rate": 9.986796105326831e-07, "loss": 0.1224, "num_tokens": 378810577.0, "reward": 1.3504464626312256, "reward_std": 0.43793973326683044, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7544642686843872, "rewards/tag_count_reward/std": 0.3181358873844147, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1134.122802734375, "completions/mean_terminated_length": 840.2802124023438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.12252943370092163, "frac_reward_zero_std": 0.0, "grad_norm": 0.13521602211336053, "kl": 0.0147247314453125, "learning_rate": 9.98653860295207e-07, "loss": 0.1528, "num_tokens": 379390984.0, "reward": 1.2126116752624512, "reward_std": 0.3841191828250885, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396106719971, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.29306527972221375, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1142.196533203125, "completions/mean_terminated_length": 857.970703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.12274252836822759, "frac_reward_zero_std": 0.0, "grad_norm": 0.1270612330381728, "kl": 0.0153961181640625, "learning_rate": 9.986278617621607e-07, "loss": 0.1305, "num_tokens": 379970400.0, "reward": 1.0943081378936768, "reward_std": 0.42143312096595764, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7081473469734192, "rewards/tag_count_reward/std": 0.31914231181144714, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 905.5402221679688, "completions/mean_terminated_length": 683.1412963867188, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.12295562303553353, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1563940452782182, "kl": 0.0179443359375, "learning_rate": 9.986016149479323e-07, "loss": 0.1285, "num_tokens": 380448386.0, "reward": 1.31640625, "reward_std": 0.3732053339481354, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7583705186843872, "rewards/tag_count_reward/std": 0.2733752429485321, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1091.85498046875, "completions/mean_terminated_length": 880.8256225585938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.12316871770283949, "frac_reward_zero_std": 0.0, "grad_norm": 0.1324270800302498, "kl": 0.0157928466796875, "learning_rate": 9.985751198670474e-07, "loss": 0.0805, "num_tokens": 381009569.0, "reward": 1.3125, "reward_std": 0.46132099628448486, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7522321343421936, "rewards/tag_count_reward/std": 0.3028491139411926, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1174.9888916015625, "completions/mean_terminated_length": 894.2861328125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.12338181237014544, "frac_reward_zero_std": 0.0, "grad_norm": 0.11677519099572714, "kl": 0.0135955810546875, "learning_rate": 9.985483765341695e-07, "loss": 0.1068, "num_tokens": 381609916.0, "reward": 1.2578125, "reward_std": 0.420994371175766, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.3130228817462921, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1102.8170166015625, "completions/mean_terminated_length": 891.0546264648438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1235949070374514, "frac_reward_zero_std": 0.0, "grad_norm": 0.13369918710314907, "kl": 0.01806640625, "learning_rate": 9.985213849640985e-07, "loss": 0.075, "num_tokens": 382172746.0, "reward": 1.4436384439468384, "reward_std": 0.3795551657676697, "rewards/accuracy_reward/mean": 0.6629464030265808, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7806919813156128, "rewards/tag_count_reward/std": 0.28532519936561584, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1150.1629638671875, "completions/mean_terminated_length": 821.6859741210938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.12380800170475734, "frac_reward_zero_std": 0.0, "grad_norm": 0.1342163847248937, "kl": 0.0158538818359375, "learning_rate": 9.984941451717722e-07, "loss": 0.1109, "num_tokens": 382757251.0, "reward": 1.2466518878936768, "reward_std": 0.3976649045944214, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.3145824074745178, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1115.4263916015625, "completions/mean_terminated_length": 867.7937622070312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1240210963720633, "frac_reward_zero_std": 0.0, "grad_norm": 0.12076706629120385, "kl": 0.0136566162109375, "learning_rate": 9.984666571722663e-07, "loss": 0.0955, "num_tokens": 383325538.0, "reward": 1.2204241752624512, "reward_std": 0.3612907826900482, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7271205186843872, "rewards/tag_count_reward/std": 0.30681297183036804, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1166.4398193359375, "completions/mean_terminated_length": 926.0142211914062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.12423419103936924, "frac_reward_zero_std": 0.0, "grad_norm": 0.12239118289945367, "kl": 0.015380859375, "learning_rate": 9.984389209807924e-07, "loss": 0.1026, "num_tokens": 383923527.0, "reward": 1.2661831378936768, "reward_std": 0.44759076833724976, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.2930056154727936, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1133.8817138671875, "completions/mean_terminated_length": 803.2431640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1244472857066752, "frac_reward_zero_std": 0.0, "grad_norm": 0.13459010859580972, "kl": 0.0155029296875, "learning_rate": 9.98410936612701e-07, "loss": 0.1256, "num_tokens": 384501090.0, "reward": 1.1350446939468384, "reward_std": 0.41443201899528503, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6975446343421936, "rewards/tag_count_reward/std": 0.3204174339771271, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1204.6160888671875, "completions/mean_terminated_length": 923.4880981445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.12466038037398114, "frac_reward_zero_std": 0.0, "grad_norm": 0.27717277354516673, "kl": 0.0333251953125, "learning_rate": 9.983827040834791e-07, "loss": 0.098, "num_tokens": 385112486.0, "reward": 1.2756696939468384, "reward_std": 0.4520913064479828, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7488839030265808, "rewards/tag_count_reward/std": 0.3159603774547577, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1139.3348388671875, "completions/mean_terminated_length": 898.0508422851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1248734750412871, "frac_reward_zero_std": 0.0, "grad_norm": 0.1265985054354985, "kl": 0.015350341796875, "learning_rate": 9.983542234087511e-07, "loss": 0.0929, "num_tokens": 385688828.0, "reward": 1.2377232313156128, "reward_std": 0.3778587579727173, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7154017686843872, "rewards/tag_count_reward/std": 0.3197934329509735, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1244.435302734375, "completions/mean_terminated_length": 919.4827270507812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.12508656970859305, "frac_reward_zero_std": 0.0, "grad_norm": 0.11671678938713659, "kl": 0.0145721435546875, "learning_rate": 9.98325494604279e-07, "loss": 0.0872, "num_tokens": 386313375.0, "reward": 1.1640625, "reward_std": 0.43318504095077515, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7020089030265808, "rewards/tag_count_reward/std": 0.340558797121048, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 969.0714721679688, "completions/mean_terminated_length": 762.4680786132812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.12529966437589898, "frac_reward_zero_std": 0.0, "grad_norm": 0.1318291513333798, "kl": 0.0173187255859375, "learning_rate": 9.982965176859622e-07, "loss": 0.0864, "num_tokens": 386811855.0, "reward": 1.3286831378936768, "reward_std": 0.4062252938747406, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.3179234266281128, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1119.0625, "completions/mean_terminated_length": 852.1264038085938, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.12551275904320494, "frac_reward_zero_std": 0.0, "grad_norm": 0.1303657070856923, "kl": 0.018035888671875, "learning_rate": 9.98267292669837e-07, "loss": 0.0709, "num_tokens": 387387227.0, "reward": 1.2756696939468384, "reward_std": 0.37614575028419495, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7511160969734192, "rewards/tag_count_reward/std": 0.293952614068985, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1265.953125, "completions/mean_terminated_length": 876.2374267578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1257258537105109, "frac_reward_zero_std": 0.0, "grad_norm": 0.12088444046628818, "kl": 0.014068603515625, "learning_rate": 9.982378195720775e-07, "loss": 0.0971, "num_tokens": 388022710.0, "reward": 1.114397406578064, "reward_std": 0.45109930634498596, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6568080186843872, "rewards/tag_count_reward/std": 0.3521164059638977, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1119.3460693359375, "completions/mean_terminated_length": 835.0641479492188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.12593894837781686, "frac_reward_zero_std": 0.0, "grad_norm": 0.13062892459143882, "kl": 0.015625, "learning_rate": 9.98208098408994e-07, "loss": 0.1069, "num_tokens": 388597329.0, "reward": 1.2739956378936768, "reward_std": 0.3894144892692566, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.31106001138687134, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1247.243408203125, "completions/mean_terminated_length": 933.9037475585938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.12615204304512279, "frac_reward_zero_std": 0.0, "grad_norm": 1.9600791058869924, "kl": 0.0240020751953125, "learning_rate": 9.98178129197036e-07, "loss": 0.0865, "num_tokens": 389227262.0, "reward": 1.1813616752624512, "reward_std": 0.5229811072349548, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7193080186843872, "rewards/tag_count_reward/std": 0.30245262384414673, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1244.5357666015625, "completions/mean_terminated_length": 894.3076782226562, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.12636513771242874, "frac_reward_zero_std": 0.0, "grad_norm": 0.12524629749491442, "kl": 0.0147552490234375, "learning_rate": 9.981479119527883e-07, "loss": 0.1135, "num_tokens": 389858910.0, "reward": 1.110491156578064, "reward_std": 0.45997804403305054, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6930803656578064, "rewards/tag_count_reward/std": 0.33083024621009827, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1091.4263916015625, "completions/mean_terminated_length": 809.4306030273438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1265782323797347, "frac_reward_zero_std": 0.0, "grad_norm": 0.1390816633199968, "kl": 0.0156097412109375, "learning_rate": 9.981174466929742e-07, "loss": 0.1027, "num_tokens": 390414397.0, "reward": 1.1400669813156128, "reward_std": 0.4445279836654663, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7293526530265808, "rewards/tag_count_reward/std": 0.29631030559539795, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1122.53125, "completions/mean_terminated_length": 902.6685180664062, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.12679132704704066, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13768040920987817, "kl": 0.0168304443359375, "learning_rate": 9.980867334344539e-07, "loss": 0.0998, "num_tokens": 390989339.0, "reward": 1.2360491752624512, "reward_std": 0.37105876207351685, "rewards/accuracy_reward/mean": 0.4930555522441864, "rewards/accuracy_reward/std": 0.5005314350128174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7606026530265808, "rewards/tag_count_reward/std": 0.27936944365501404, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1103.9442138671875, "completions/mean_terminated_length": 822.095703125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1270044217143466, "frac_reward_zero_std": 0.0, "grad_norm": 0.1261107229382252, "kl": 0.0153045654296875, "learning_rate": 9.980557721942243e-07, "loss": 0.1043, "num_tokens": 391550530.0, "reward": 1.3404018878936768, "reward_std": 0.37021195888519287, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7600446343421936, "rewards/tag_count_reward/std": 0.2966245412826538, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1397.227783203125, "completions/mean_terminated_length": 960.1417846679688, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.12721751638165255, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11546365972495108, "kl": 0.0106658935546875, "learning_rate": 9.98024562989421e-07, "loss": 0.1319, "num_tokens": 392249480.0, "reward": 0.8984375596046448, "reward_std": 0.3560902774333954, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46031373739242554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5948660969734192, "rewards/tag_count_reward/std": 0.349874347448349, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1256.665283203125, "completions/mean_terminated_length": 918.9617919921875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1274306110489585, "frac_reward_zero_std": 0.0, "grad_norm": 0.11326135773012093, "kl": 0.0140380859375, "learning_rate": 9.979931058373155e-07, "loss": 0.0761, "num_tokens": 392877746.0, "reward": 1.1785714626312256, "reward_std": 0.4073047935962677, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7142857313156128, "rewards/tag_count_reward/std": 0.32228410243988037, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1079.763427734375, "completions/mean_terminated_length": 822.6610107421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.12764370571626446, "frac_reward_zero_std": 0.0, "grad_norm": 0.12952250402107507, "kl": 0.015594482421875, "learning_rate": 9.979614007553166e-07, "loss": 0.1066, "num_tokens": 393426520.0, "reward": 1.3459821939468384, "reward_std": 0.4195391535758972, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7633928656578064, "rewards/tag_count_reward/std": 0.30209797620773315, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1003.8638916015625, "completions/mean_terminated_length": 854.7015380859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1278568003835704, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13740573734591055, "kl": 0.018768310546875, "learning_rate": 9.97929447760971e-07, "loss": 0.0842, "num_tokens": 393941323.0, "reward": 1.6010044813156128, "reward_std": 0.34866729378700256, "rewards/accuracy_reward/mean": 0.7767857313156128, "rewards/accuracy_reward/std": 0.41686633229255676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.2626858651638031, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1078.90625, "completions/mean_terminated_length": 855.2692260742188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.12806989505087635, "frac_reward_zero_std": 0.0, "grad_norm": 0.14318462971144025, "kl": 0.017120361328125, "learning_rate": 9.978972468719622e-07, "loss": 0.1325, "num_tokens": 394494945.0, "reward": 1.328125, "reward_std": 0.41569894552230835, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7767857313156128, "rewards/tag_count_reward/std": 0.29652562737464905, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1201.274658203125, "completions/mean_terminated_length": 912.2724609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1282829897181823, "frac_reward_zero_std": 0.0, "grad_norm": 0.12690026125016113, "kl": 0.0154266357421875, "learning_rate": 9.978647981061108e-07, "loss": 0.0603, "num_tokens": 395105820.0, "reward": 1.243303656578064, "reward_std": 0.4231824278831482, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7254464030265808, "rewards/tag_count_reward/std": 0.3254833221435547, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1264.3616943359375, "completions/mean_terminated_length": 961.095947265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.12849608438548826, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11392107489248589, "kl": 0.014801025390625, "learning_rate": 9.978321014813748e-07, "loss": 0.0765, "num_tokens": 395741118.0, "reward": 1.28125, "reward_std": 0.4374000132083893, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7165178656578064, "rewards/tag_count_reward/std": 0.31462404131889343, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 990.6272583007812, "completions/mean_terminated_length": 774.6048583984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1287091790527942, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12087826663143154, "kl": 0.015380859375, "learning_rate": 9.97799157015849e-07, "loss": 0.065, "num_tokens": 396252471.0, "reward": 1.33203125, "reward_std": 0.2973857820034027, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7873883843421936, "rewards/tag_count_reward/std": 0.27553585171699524, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1200.091552734375, "completions/mean_terminated_length": 930.7559204101562, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.12892227372010015, "frac_reward_zero_std": 0.0, "grad_norm": 0.13386125926621395, "kl": 0.01373291015625, "learning_rate": 9.977659647277663e-07, "loss": 0.0993, "num_tokens": 396856832.0, "reward": 1.1936384439468384, "reward_std": 0.4064633548259735, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7181919813156128, "rewards/tag_count_reward/std": 0.3150200843811035, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1033.8482666015625, "completions/mean_terminated_length": 852.3684692382812, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1291353683874061, "frac_reward_zero_std": 0.0, "grad_norm": 0.1425421215203902, "kl": 0.017364501953125, "learning_rate": 9.977325246354956e-07, "loss": 0.0829, "num_tokens": 397381244.0, "reward": 1.2907366752624512, "reward_std": 0.41282179951667786, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.28231218457221985, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1201.227783203125, "completions/mean_terminated_length": 880.7568969726562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.12934846305471207, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13011060426531068, "kl": 0.015533447265625, "learning_rate": 9.976988367575433e-07, "loss": 0.0913, "num_tokens": 397995362.0, "reward": 1.140625, "reward_std": 0.39326173067092896, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7388392686843872, "rewards/tag_count_reward/std": 0.3044935464859009, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1276.247802734375, "completions/mean_terminated_length": 939.8429565429688, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.129561557722018, "frac_reward_zero_std": 0.0, "grad_norm": 0.12135026705591755, "kl": 0.012969970703125, "learning_rate": 9.976649011125534e-07, "loss": 0.1089, "num_tokens": 398641729.0, "reward": 1.153459906578064, "reward_std": 0.45584455132484436, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6958705186843872, "rewards/tag_count_reward/std": 0.32981881499290466, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1050.046875, "completions/mean_terminated_length": 858.949462890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.12977465238932395, "frac_reward_zero_std": 0.0, "grad_norm": 0.13980480078285165, "kl": 0.015380859375, "learning_rate": 9.976307177193067e-07, "loss": 0.1079, "num_tokens": 399185830.0, "reward": 1.1473214626312256, "reward_std": 0.39241477847099304, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7321428656578064, "rewards/tag_count_reward/std": 0.3118901550769806, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1029.7701416015625, "completions/mean_terminated_length": 773.7904663085938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1299877470566299, "frac_reward_zero_std": 0.0, "grad_norm": 0.13438484725231253, "kl": 0.016448974609375, "learning_rate": 9.975962865967208e-07, "loss": 0.0733, "num_tokens": 399715151.0, "reward": 1.3443081378936768, "reward_std": 0.35527294874191284, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7661830186843872, "rewards/tag_count_reward/std": 0.3063283860683441, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1059.13623046875, "completions/mean_terminated_length": 860.302978515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.13020084172393587, "frac_reward_zero_std": 0.0, "grad_norm": 0.12661166257983653, "kl": 0.014129638671875, "learning_rate": 9.975616077638509e-07, "loss": 0.0998, "num_tokens": 400258268.0, "reward": 1.2606027126312256, "reward_std": 0.36463993787765503, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7739955186843872, "rewards/tag_count_reward/std": 0.27652183175086975, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1124.2076416015625, "completions/mean_terminated_length": 841.4140014648438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1304139363912418, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11263923560629889, "kl": 0.01434326171875, "learning_rate": 9.97526681239889e-07, "loss": 0.0706, "num_tokens": 400829353.0, "reward": 1.219866156578064, "reward_std": 0.3816168010234833, "rewards/accuracy_reward/mean": 0.4861111044883728, "rewards/accuracy_reward/std": 0.5003865361213684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7511160969734192, "rewards/tag_count_reward/std": 0.31015416979789734, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1258.325927734375, "completions/mean_terminated_length": 942.4562377929688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.13062703105854775, "frac_reward_zero_std": 0.0, "grad_norm": 0.13944191597400596, "kl": 0.0150146484375, "learning_rate": 9.974915070441643e-07, "loss": 0.0951, "num_tokens": 401462347.0, "reward": 1.1858259439468384, "reward_std": 0.5252411365509033, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.3229955732822418, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1057.4420166015625, "completions/mean_terminated_length": 769.1239013671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.1308401257258537, "frac_reward_zero_std": 0.0, "grad_norm": 0.14184457125869523, "kl": 0.0157623291015625, "learning_rate": 9.974560851961428e-07, "loss": 0.1277, "num_tokens": 402008417.0, "reward": 1.2566964626312256, "reward_std": 0.38438650965690613, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7477678656578064, "rewards/tag_count_reward/std": 0.30606386065483093, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1068.212158203125, "completions/mean_terminated_length": 825.3119506835938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.13105322039315967, "frac_reward_zero_std": 0.0, "grad_norm": 0.13084053709784318, "kl": 0.0151214599609375, "learning_rate": 9.974204157154284e-07, "loss": 0.0942, "num_tokens": 402557200.0, "reward": 1.2215402126312256, "reward_std": 0.4468922019004822, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7237723469734192, "rewards/tag_count_reward/std": 0.305172860622406, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1075.763427734375, "completions/mean_terminated_length": 834.7353515625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1312663150604656, "frac_reward_zero_std": 0.0, "grad_norm": 0.1284271053254254, "kl": 0.0175933837890625, "learning_rate": 9.973844986217606e-07, "loss": 0.0858, "num_tokens": 403103398.0, "reward": 1.2935268878936768, "reward_std": 0.35578545928001404, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.3114555776119232, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1018.90185546875, "completions/mean_terminated_length": 777.9284057617188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.13147940972777156, "frac_reward_zero_std": 0.0, "grad_norm": 0.15305936447695034, "kl": 0.017303466796875, "learning_rate": 9.973483339350173e-07, "loss": 0.0856, "num_tokens": 403639834.0, "reward": 1.1506696939468384, "reward_std": 0.38870009779930115, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7198660969734192, "rewards/tag_count_reward/std": 0.3032013773918152, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1078.1920166015625, "completions/mean_terminated_length": 847.7955932617188, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.13169250439507751, "frac_reward_zero_std": 0.0, "grad_norm": 0.1292829583228314, "kl": 0.01483154296875, "learning_rate": 9.973119216752129e-07, "loss": 0.0648, "num_tokens": 404193264.0, "reward": 1.2650669813156128, "reward_std": 0.44154465198516846, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7516741156578064, "rewards/tag_count_reward/std": 0.2984345257282257, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 992.4777221679688, "completions/mean_terminated_length": 783.6310424804688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.13190559906238347, "frac_reward_zero_std": 0.0, "grad_norm": 0.1516340580583699, "kl": 0.01800537109375, "learning_rate": 9.972752618624986e-07, "loss": 0.1207, "num_tokens": 404698950.0, "reward": 1.3119419813156128, "reward_std": 0.41222789883613586, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7806919813156128, "rewards/tag_count_reward/std": 0.28187403082847595, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1199.415283203125, "completions/mean_terminated_length": 923.24853515625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1321186937296894, "frac_reward_zero_std": 0.0, "grad_norm": 0.1237324935344492, "kl": 0.0140380859375, "learning_rate": 9.97238354517163e-07, "loss": 0.1148, "num_tokens": 405305856.0, "reward": 1.1434152126312256, "reward_std": 0.4352182149887085, "rewards/accuracy_reward/mean": 0.44212964177131653, "rewards/accuracy_reward/std": 0.4972155690193176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7170758843421936, "rewards/tag_count_reward/std": 0.3072642683982849, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1131.3348388671875, "completions/mean_terminated_length": 904.0835571289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.13233178839699536, "frac_reward_zero_std": 0.0, "grad_norm": 0.13563462497660211, "kl": 0.017669677734375, "learning_rate": 9.972011996596311e-07, "loss": 0.0806, "num_tokens": 405878694.0, "reward": 1.364397406578064, "reward_std": 0.39978983998298645, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7818080186843872, "rewards/tag_count_reward/std": 0.2957916557788849, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1010.6250610351562, "completions/mean_terminated_length": 818.5184936523438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.13254488306430132, "frac_reward_zero_std": 0.0, "grad_norm": 0.1401930975661383, "kl": 0.0185546875, "learning_rate": 9.971637973104656e-07, "loss": 0.0855, "num_tokens": 406398078.0, "reward": 1.4313616752624512, "reward_std": 0.43678978085517883, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.27621012926101685, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1185.200927734375, "completions/mean_terminated_length": 901.0148315429688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.13275797773160727, "frac_reward_zero_std": 0.0, "grad_norm": 0.12895423973973746, "kl": 0.01519775390625, "learning_rate": 9.971261474903658e-07, "loss": 0.1075, "num_tokens": 407000456.0, "reward": 1.1372768878936768, "reward_std": 0.39142924547195435, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7154017686843872, "rewards/tag_count_reward/std": 0.2966582179069519, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1261.4375, "completions/mean_terminated_length": 946.8125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1329710723989132, "frac_reward_zero_std": 0.0, "grad_norm": 0.11316949963594328, "kl": 0.013427734375, "learning_rate": 9.970882502201679e-07, "loss": 0.09, "num_tokens": 407639036.0, "reward": 1.1082589626312256, "reward_std": 0.39533933997154236, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6863839030265808, "rewards/tag_count_reward/std": 0.3121762275695801, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1241.669677734375, "completions/mean_terminated_length": 886.4694213867188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.13318416706621916, "frac_reward_zero_std": 0.0, "grad_norm": 0.10642194438336465, "kl": 0.0135040283203125, "learning_rate": 9.970501055208453e-07, "loss": 0.089, "num_tokens": 408261816.0, "reward": 1.1902902126312256, "reward_std": 0.4263664186000824, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6969866156578064, "rewards/tag_count_reward/std": 0.337540864944458, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1141.51123046875, "completions/mean_terminated_length": 884.3696899414062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.13339726173352512, "frac_reward_zero_std": 0.0, "grad_norm": 0.1735016125967563, "kl": 0.019012451171875, "learning_rate": 9.97011713413508e-07, "loss": 0.0894, "num_tokens": 408844989.0, "reward": 1.2645089626312256, "reward_std": 0.421338826417923, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.3121282458305359, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1154.618408203125, "completions/mean_terminated_length": 910.96875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.13361035640083108, "frac_reward_zero_std": 0.0, "grad_norm": 0.12804612253774367, "kl": 0.0163726806640625, "learning_rate": 9.96973073919403e-07, "loss": 0.1015, "num_tokens": 409429666.0, "reward": 1.3203125, "reward_std": 0.45484986901283264, "rewards/accuracy_reward/mean": 0.6111111044883728, "rewards/accuracy_reward/std": 0.488063246011734, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.31657615303993225, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1148.4241943359375, "completions/mean_terminated_length": 788.59375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.13382345106813703, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13172104875519944, "kl": 0.0131683349609375, "learning_rate": 9.969341870599148e-07, "loss": 0.1081, "num_tokens": 410010736.0, "reward": 1.10546875, "reward_std": 0.36399808526039124, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6947544813156128, "rewards/tag_count_reward/std": 0.3317473530769348, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1271.26123046875, "completions/mean_terminated_length": 1039.365234375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.13403654573544296, "frac_reward_zero_std": 0.0, "grad_norm": 0.11875267632845464, "kl": 0.015411376953125, "learning_rate": 9.968950528565637e-07, "loss": 0.0859, "num_tokens": 410656981.0, "reward": 1.2533482313156128, "reward_std": 0.42358607053756714, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.3053349256515503, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1029.671875, "completions/mean_terminated_length": 798.1068725585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.13424964040274892, "frac_reward_zero_std": 0.0, "grad_norm": 0.13772995985420225, "kl": 0.015655517578125, "learning_rate": 9.968556713310077e-07, "loss": 0.1385, "num_tokens": 411184754.0, "reward": 1.266741156578064, "reward_std": 0.39507296681404114, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.30141741037368774, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1140.6317138671875, "completions/mean_terminated_length": 866.31103515625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.13446273507005488, "frac_reward_zero_std": 0.0, "grad_norm": 0.20208071788812898, "kl": 0.0176239013671875, "learning_rate": 9.968160425050418e-07, "loss": 0.1291, "num_tokens": 411766349.0, "reward": 1.227678656578064, "reward_std": 0.4629303216934204, "rewards/accuracy_reward/mean": 0.5462962985038757, "rewards/accuracy_reward/std": 0.4984292685985565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7008928656578064, "rewards/tag_count_reward/std": 0.32311972975730896, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1240.134033203125, "completions/mean_terminated_length": 902.6708984375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.13467582973736084, "frac_reward_zero_std": 0.0, "grad_norm": 0.11959270741761663, "kl": 0.01312255859375, "learning_rate": 9.96776166400597e-07, "loss": 0.1143, "num_tokens": 412390121.0, "reward": 1.08203125, "reward_std": 0.39291685819625854, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6847098469734192, "rewards/tag_count_reward/std": 0.31870388984680176, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1227.71435546875, "completions/mean_terminated_length": 931.0151977539062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.13488892440466677, "frac_reward_zero_std": 0.0, "grad_norm": 0.13621065368008214, "kl": 0.0145721435546875, "learning_rate": 9.967360430397418e-07, "loss": 0.1484, "num_tokens": 413015337.0, "reward": 1.2527902126312256, "reward_std": 0.4855668544769287, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7081473469734192, "rewards/tag_count_reward/std": 0.3160606324672699, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1186.27685546875, "completions/mean_terminated_length": 885.1927490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.13510201907197272, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1238694995629794, "kl": 0.0150909423828125, "learning_rate": 9.966956724446816e-07, "loss": 0.0906, "num_tokens": 413615781.0, "reward": 1.1512277126312256, "reward_std": 0.4346998333930969, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7025669813156128, "rewards/tag_count_reward/std": 0.3125973641872406, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1231.29248046875, "completions/mean_terminated_length": 939.257568359375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.13531511373927868, "frac_reward_zero_std": 0.0, "grad_norm": 0.12399338572859975, "kl": 0.0149078369140625, "learning_rate": 9.966550546377586e-07, "loss": 0.0786, "num_tokens": 414231528.0, "reward": 1.161272406578064, "reward_std": 0.4347052574157715, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7081473469734192, "rewards/tag_count_reward/std": 0.3029598891735077, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1243.3616943359375, "completions/mean_terminated_length": 925.0155639648438, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.13552820840658464, "frac_reward_zero_std": 0.0, "grad_norm": 0.13078842583908154, "kl": 0.013336181640625, "learning_rate": 9.96614189641451e-07, "loss": 0.1098, "num_tokens": 414865738.0, "reward": 1.1272321939468384, "reward_std": 0.48820939660072327, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6540178656578064, "rewards/tag_count_reward/std": 0.32938748598098755, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1139.2366943359375, "completions/mean_terminated_length": 897.9265747070312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.13574130307389057, "frac_reward_zero_std": 0.0, "grad_norm": 0.6761653997003791, "kl": 0.0206756591796875, "learning_rate": 9.96573077478375e-07, "loss": 0.0816, "num_tokens": 415450980.0, "reward": 1.165178656578064, "reward_std": 0.3777007460594177, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.494759202003479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7410714030265808, "rewards/tag_count_reward/std": 0.30730947852134705, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1251.091552734375, "completions/mean_terminated_length": 833.6632690429688, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.13595439774119653, "frac_reward_zero_std": 0.0, "grad_norm": 0.12935028109246918, "kl": 0.01348876953125, "learning_rate": 9.96531718171283e-07, "loss": 0.1035, "num_tokens": 416084061.0, "reward": 1.1986607313156128, "reward_std": 0.4210280179977417, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6919642686843872, "rewards/tag_count_reward/std": 0.3535251319408417, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1135.51123046875, "completions/mean_terminated_length": 852.6929931640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.13616749240850248, "frac_reward_zero_std": 0.0, "grad_norm": 0.1288742480884207, "kl": 0.01458740234375, "learning_rate": 9.96490111743064e-07, "loss": 0.1407, "num_tokens": 416660226.0, "reward": 1.262834906578064, "reward_std": 0.42723309993743896, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7360491156578064, "rewards/tag_count_reward/std": 0.29481083154678345, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1210.53125, "completions/mean_terminated_length": 954.163330078125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.13638058707580844, "frac_reward_zero_std": 0.0, "grad_norm": 0.12220346223242336, "kl": 0.0132598876953125, "learning_rate": 9.96448258216744e-07, "loss": 0.0995, "num_tokens": 417271024.0, "reward": 1.2477679252624512, "reward_std": 0.4914741814136505, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.30976149439811707, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1124.0960693359375, "completions/mean_terminated_length": 844.7761840820312, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13659368174311437, "frac_reward_zero_std": 0.0, "grad_norm": 0.11928098244949728, "kl": 0.0154266357421875, "learning_rate": 9.964061576154856e-07, "loss": 0.1122, "num_tokens": 417846443.0, "reward": 1.3459821939468384, "reward_std": 0.3628884255886078, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7522321343421936, "rewards/tag_count_reward/std": 0.3137339651584625, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1043.5826416015625, "completions/mean_terminated_length": 804.964111328125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.13680677641042033, "frac_reward_zero_std": 0.0, "grad_norm": 0.13661580769527956, "kl": 0.0164337158203125, "learning_rate": 9.963638099625888e-07, "loss": 0.1131, "num_tokens": 418378000.0, "reward": 1.2661831378936768, "reward_std": 0.3823888599872589, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7728794813156128, "rewards/tag_count_reward/std": 0.29708874225616455, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1150.5179443359375, "completions/mean_terminated_length": 885.9421997070312, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.13701987107772629, "frac_reward_zero_std": 0.0, "grad_norm": 0.12445545588732315, "kl": 0.01507568359375, "learning_rate": 9.963212152814892e-07, "loss": 0.0693, "num_tokens": 418959960.0, "reward": 1.2444196939468384, "reward_std": 0.4136959910392761, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7645089030265808, "rewards/tag_count_reward/std": 0.2897607386112213, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1031.5201416015625, "completions/mean_terminated_length": 803.7841186523438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.13723296574503224, "frac_reward_zero_std": 0.0, "grad_norm": 0.1345315641334434, "kl": 0.015289306640625, "learning_rate": 9.962783735957599e-07, "loss": 0.1082, "num_tokens": 419494113.0, "reward": 1.317522406578064, "reward_std": 0.37469494342803955, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855977296829224, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7728794813156128, "rewards/tag_count_reward/std": 0.2786311209201813, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 998.169677734375, "completions/mean_terminated_length": 816.7853393554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.13744606041233817, "frac_reward_zero_std": 0.0, "grad_norm": 0.13391975107457185, "kl": 0.0179595947265625, "learning_rate": 9.962352849291106e-07, "loss": 0.124, "num_tokens": 420007533.0, "reward": 1.3856027126312256, "reward_std": 0.4255538880825043, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.27235499024391174, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1168.640625, "completions/mean_terminated_length": 879.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.13765915507964413, "frac_reward_zero_std": 0.0, "grad_norm": 0.12463116313955848, "kl": 0.0140838623046875, "learning_rate": 9.961919493053876e-07, "loss": 0.1024, "num_tokens": 420605900.0, "reward": 1.157366156578064, "reward_std": 0.34890905022621155, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7287946343421936, "rewards/tag_count_reward/std": 0.3053349256515503, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1035.0335693359375, "completions/mean_terminated_length": 853.7658081054688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1378722497469501, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13722399284732453, "kl": 0.019561767578125, "learning_rate": 9.961483667485734e-07, "loss": 0.0861, "num_tokens": 421138139.0, "reward": 1.3939732313156128, "reward_std": 0.3611336052417755, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7912946343421936, "rewards/tag_count_reward/std": 0.2871640622615814, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1148.462158203125, "completions/mean_terminated_length": 937.8264770507812, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.13808534441425604, "frac_reward_zero_std": 0.0, "grad_norm": 0.1200003729768075, "kl": 0.014678955078125, "learning_rate": 9.961045372827882e-07, "loss": 0.0774, "num_tokens": 421720426.0, "reward": 1.3264509439468384, "reward_std": 0.4491410553455353, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7639508843421936, "rewards/tag_count_reward/std": 0.3009132444858551, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1108.8035888671875, "completions/mean_terminated_length": 872.6926879882812, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.13829843908156197, "frac_reward_zero_std": 0.0, "grad_norm": 0.1199079648018838, "kl": 0.016571044921875, "learning_rate": 9.96060460932288e-07, "loss": 0.1102, "num_tokens": 422282018.0, "reward": 1.3219866752624512, "reward_std": 0.4110329747200012, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7728794813156128, "rewards/tag_count_reward/std": 0.28508007526397705, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1128.1317138671875, "completions/mean_terminated_length": 887.152099609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.13851153374886793, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12007830407921395, "kl": 0.0161590576171875, "learning_rate": 9.960161377214657e-07, "loss": 0.092, "num_tokens": 422853725.0, "reward": 1.1077009439468384, "reward_std": 0.313351035118103, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7170758843421936, "rewards/tag_count_reward/std": 0.29755479097366333, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1018.72998046875, "completions/mean_terminated_length": 828.1243286132812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1387246284161739, "frac_reward_zero_std": 0.0, "grad_norm": 0.13035382031757464, "kl": 0.01568603515625, "learning_rate": 9.959715676748508e-07, "loss": 0.0787, "num_tokens": 423376068.0, "reward": 1.3069196939468384, "reward_std": 0.44363412261009216, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7801339030265808, "rewards/tag_count_reward/std": 0.28069183230400085, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1123.0982666015625, "completions/mean_terminated_length": 822.0946655273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.13893772308347985, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13919123000043887, "kl": 0.01605224609375, "learning_rate": 9.959267508171093e-07, "loss": 0.1221, "num_tokens": 423943728.0, "reward": 1.1623884439468384, "reward_std": 0.3710832893848419, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7338169813156128, "rewards/tag_count_reward/std": 0.29799917340278625, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1267.837158203125, "completions/mean_terminated_length": 875.1375732421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.13915081775078578, "frac_reward_zero_std": 0.0, "grad_norm": 1.636419193272952, "kl": 0.077545166015625, "learning_rate": 9.958816871730442e-07, "loss": 0.1224, "num_tokens": 424587191.0, "reward": 1.087053656578064, "reward_std": 0.4358670115470886, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6986607313156128, "rewards/tag_count_reward/std": 0.336348295211792, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1133.1741943359375, "completions/mean_terminated_length": 842.5823974609375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.13936391241809173, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1244959609911837, "kl": 0.0164794921875, "learning_rate": 9.958363767675943e-07, "loss": 0.0849, "num_tokens": 425167685.0, "reward": 1.2583706378936768, "reward_std": 0.3871222138404846, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7472098469734192, "rewards/tag_count_reward/std": 0.294179230928421, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1158.602783203125, "completions/mean_terminated_length": 844.2235717773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1395770070853977, "frac_reward_zero_std": 0.0, "grad_norm": 0.12294051632959824, "kl": 0.0159149169921875, "learning_rate": 9.95790819625836e-07, "loss": 0.0517, "num_tokens": 425755043.0, "reward": 1.231584906578064, "reward_std": 0.41363105177879333, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7405133843421936, "rewards/tag_count_reward/std": 0.3102361857891083, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1186.7857666015625, "completions/mean_terminated_length": 970.279296875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.13979010175270365, "frac_reward_zero_std": 0.0, "grad_norm": 0.135112353243703, "kl": 0.019775390625, "learning_rate": 9.957450157729813e-07, "loss": 0.1379, "num_tokens": 426365235.0, "reward": 1.2209821939468384, "reward_std": 0.42713111639022827, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7790178656578064, "rewards/tag_count_reward/std": 0.2963150143623352, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1196.7857666015625, "completions/mean_terminated_length": 939.4418334960938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.14000319642000958, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13049455926595557, "kl": 0.017578125, "learning_rate": 9.95698965234379e-07, "loss": 0.0832, "num_tokens": 426967011.0, "reward": 1.2806919813156128, "reward_std": 0.4483509957790375, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7561383843421936, "rewards/tag_count_reward/std": 0.3025640547275543, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1149.5067138671875, "completions/mean_terminated_length": 877.8692016601562, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14021629108731554, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12853634277442424, "kl": 0.019256591796875, "learning_rate": 9.956526680355151e-07, "loss": 0.1195, "num_tokens": 427556262.0, "reward": 1.1969866752624512, "reward_std": 0.42253217101097107, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7438616156578064, "rewards/tag_count_reward/std": 0.3066949248313904, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1021.05810546875, "completions/mean_terminated_length": 846.7728881835938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1404293857546215, "frac_reward_zero_std": 0.0, "grad_norm": 0.13510570519071793, "kl": 0.0189208984375, "learning_rate": 9.956061242020112e-07, "loss": 0.0607, "num_tokens": 428078976.0, "reward": 1.4062501192092896, "reward_std": 0.3642091155052185, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.28867512941360474, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1139.888427734375, "completions/mean_terminated_length": 917.9055786132812, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.14064248042192745, "frac_reward_zero_std": 0.0, "grad_norm": 0.13262987804753062, "kl": 0.01873779296875, "learning_rate": 9.955593337596257e-07, "loss": 0.0913, "num_tokens": 428659838.0, "reward": 1.3716518878936768, "reward_std": 0.4764446020126343, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.2984538972377777, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1211.419677734375, "completions/mean_terminated_length": 935.8694458007812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14085557508923338, "frac_reward_zero_std": 0.0, "grad_norm": 0.38541013683293884, "kl": 0.020721435546875, "learning_rate": 9.955122967342536e-07, "loss": 0.0893, "num_tokens": 429274122.0, "reward": 1.203125, "reward_std": 0.4242902398109436, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7455357313156128, "rewards/tag_count_reward/std": 0.3105745017528534, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1206.0335693359375, "completions/mean_terminated_length": 922.02685546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.14106866975653934, "frac_reward_zero_std": 0.0, "grad_norm": 0.11643248789949381, "kl": 0.01495361328125, "learning_rate": 9.954650131519264e-07, "loss": 0.0502, "num_tokens": 429880953.0, "reward": 1.1629464626312256, "reward_std": 0.3195909559726715, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7366071343421936, "rewards/tag_count_reward/std": 0.2955472469329834, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1169.2567138671875, "completions/mean_terminated_length": 872.8447875976562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1412817644238453, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12806787224321653, "kl": 0.0146484375, "learning_rate": 9.95417483038812e-07, "loss": 0.1192, "num_tokens": 430477500.0, "reward": 1.176897406578064, "reward_std": 0.45724910497665405, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.7327008843421936, "rewards/tag_count_reward/std": 0.325735867023468, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1124.196533203125, "completions/mean_terminated_length": 868.9002685546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.14149485909115125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1188481689409286, "kl": 0.0155792236328125, "learning_rate": 9.953697064212145e-07, "loss": 0.0819, "num_tokens": 431051140.0, "reward": 1.3063616752624512, "reward_std": 0.35998496413230896, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7572544813156128, "rewards/tag_count_reward/std": 0.30253928899765015, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1222.5513916015625, "completions/mean_terminated_length": 877.7437133789062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.14170795375845718, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13900708151281793, "kl": 0.0157470703125, "learning_rate": 9.953216833255745e-07, "loss": 0.0878, "num_tokens": 431665499.0, "reward": 1.1155134439468384, "reward_std": 0.4503583014011383, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7092633843421936, "rewards/tag_count_reward/std": 0.3330056071281433, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1201.6004638671875, "completions/mean_terminated_length": 922.8160400390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.14192104842576314, "frac_reward_zero_std": 0.0, "grad_norm": 0.14679938431054437, "kl": 0.020843505859375, "learning_rate": 9.952734137784693e-07, "loss": 0.0901, "num_tokens": 432276552.0, "reward": 1.172991156578064, "reward_std": 0.36652687191963196, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7444196343421936, "rewards/tag_count_reward/std": 0.3101058900356293, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1067.90625, "completions/mean_terminated_length": 824.9303588867188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1421341430930691, "frac_reward_zero_std": 0.0, "grad_norm": 0.13679206189772566, "kl": 0.0166015625, "learning_rate": 9.952248978066123e-07, "loss": 0.1453, "num_tokens": 432824030.0, "reward": 1.21484375, "reward_std": 0.40582770109176636, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.28955546021461487, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1195.274658203125, "completions/mean_terminated_length": 904.224609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.14234723776037506, "frac_reward_zero_std": 0.0, "grad_norm": 0.13111975263057254, "kl": 0.0186767578125, "learning_rate": 9.951761354368534e-07, "loss": 0.0692, "num_tokens": 433423625.0, "reward": 1.3537946939468384, "reward_std": 0.4087453782558441, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7645089030265808, "rewards/tag_count_reward/std": 0.33413445949554443, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1120.404052734375, "completions/mean_terminated_length": 936.8690185546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.14256033242768099, "frac_reward_zero_std": 0.0, "grad_norm": 0.12232292897697075, "kl": 0.0156402587890625, "learning_rate": 9.95127126696179e-07, "loss": 0.1035, "num_tokens": 433996782.0, "reward": 1.274553656578064, "reward_std": 0.42003926634788513, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7633928656578064, "rewards/tag_count_reward/std": 0.27849724888801575, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1010.3750610351562, "completions/mean_terminated_length": 760.3102416992188, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.14277342709498694, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395553786982362, "kl": 0.017791748046875, "learning_rate": 9.950778716117116e-07, "loss": 0.1133, "num_tokens": 434517526.0, "reward": 1.26953125, "reward_std": 0.35728737711906433, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7806919813156128, "rewards/tag_count_reward/std": 0.2944888472557068, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1047.2879638671875, "completions/mean_terminated_length": 816.3544311523438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1429865217622929, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1196272266497482, "kl": 0.01806640625, "learning_rate": 9.950283702107098e-07, "loss": 0.047, "num_tokens": 435059271.0, "reward": 1.3928571939468384, "reward_std": 0.35737544298171997, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7946428656578064, "rewards/tag_count_reward/std": 0.28124499320983887, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1198.7523193359375, "completions/mean_terminated_length": 932.272705078125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.14319961642959886, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260055842613297, "kl": 0.016998291015625, "learning_rate": 9.949786225205693e-07, "loss": 0.0711, "num_tokens": 435663080.0, "reward": 1.2321429252624512, "reward_std": 0.4431600868701935, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7566964030265808, "rewards/tag_count_reward/std": 0.3118821680545807, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1073.26123046875, "completions/mean_terminated_length": 867.7756958007812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1434127110969048, "frac_reward_zero_std": 0.0, "grad_norm": 0.13601337713735054, "kl": 0.018524169921875, "learning_rate": 9.949286285688215e-07, "loss": 0.0904, "num_tokens": 436212333.0, "reward": 1.3789063692092896, "reward_std": 0.3620477616786957, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7940848469734192, "rewards/tag_count_reward/std": 0.27198344469070435, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1072.62060546875, "completions/mean_terminated_length": 854.0928955078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14362580576421075, "frac_reward_zero_std": 0.0, "grad_norm": 0.136176703848127, "kl": 0.0145111083984375, "learning_rate": 9.94878388383134e-07, "loss": 0.1493, "num_tokens": 436758611.0, "reward": 1.20703125, "reward_std": 0.4154183566570282, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7494419813156128, "rewards/tag_count_reward/std": 0.29702988266944885, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1164.8170166015625, "completions/mean_terminated_length": 914.2865600585938, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1438389004315167, "frac_reward_zero_std": 0.0, "grad_norm": 0.13341127566071212, "kl": 0.013092041015625, "learning_rate": 9.948279019913111e-07, "loss": 0.1066, "num_tokens": 437354481.0, "reward": 1.1618304252624512, "reward_std": 0.3986019492149353, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7399553656578064, "rewards/tag_count_reward/std": 0.2994394302368164, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1141.779052734375, "completions/mean_terminated_length": 817.736328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.14405199509882266, "frac_reward_zero_std": 0.0, "grad_norm": 0.8908448030934071, "kl": 0.023681640625, "learning_rate": 9.947771694212933e-07, "loss": 0.0828, "num_tokens": 437940734.0, "reward": 1.1194196939468384, "reward_std": 0.44295674562454224, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6662946343421936, "rewards/tag_count_reward/std": 0.32764512300491333, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1057.8148193359375, "completions/mean_terminated_length": 832.6493530273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1442650897661286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1345079693676448, "kl": 0.015411376953125, "learning_rate": 9.947261907011568e-07, "loss": 0.1195, "num_tokens": 438485115.0, "reward": 1.2154018878936768, "reward_std": 0.4091176390647888, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.3114555776119232, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1218.810302734375, "completions/mean_terminated_length": 961.8099365234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.14447818443343455, "frac_reward_zero_std": 0.0, "grad_norm": 0.11610939918420013, "kl": 0.013336181640625, "learning_rate": 9.946749658591147e-07, "loss": 0.1242, "num_tokens": 439104230.0, "reward": 1.1869419813156128, "reward_std": 0.4558294415473938, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7472098469734192, "rewards/tag_count_reward/std": 0.3085617125034332, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1103.5357666015625, "completions/mean_terminated_length": 925.665771484375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1446912791007405, "frac_reward_zero_std": 0.0, "grad_norm": 0.24679484703237348, "kl": 0.017578125, "learning_rate": 9.946234949235159e-07, "loss": 0.1552, "num_tokens": 439673126.0, "reward": 1.2388393878936768, "reward_std": 0.4114640951156616, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7522321343421936, "rewards/tag_count_reward/std": 0.29251575469970703, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1076.08935546875, "completions/mean_terminated_length": 858.3387451171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14490437376804646, "frac_reward_zero_std": 0.0, "grad_norm": 0.12698216031062878, "kl": 0.0137939453125, "learning_rate": 9.945717779228458e-07, "loss": 0.1167, "num_tokens": 440224462.0, "reward": 1.1127232313156128, "reward_std": 0.41351184248924255, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7466517686843872, "rewards/tag_count_reward/std": 0.29771679639816284, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1216.7701416015625, "completions/mean_terminated_length": 873.2650146484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1451174684353524, "frac_reward_zero_std": 0.0, "grad_norm": 0.12372034551200503, "kl": 0.0140533447265625, "learning_rate": 9.945198148857257e-07, "loss": 0.1462, "num_tokens": 440839079.0, "reward": 1.2103794813156128, "reward_std": 0.46855056285858154, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7237723469734192, "rewards/tag_count_reward/std": 0.31686145067214966, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1153.51123046875, "completions/mean_terminated_length": 858.88720703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14533056310265835, "frac_reward_zero_std": 0.0, "grad_norm": 0.12119071938007452, "kl": 0.0131683349609375, "learning_rate": 9.944676058409132e-07, "loss": 0.1452, "num_tokens": 441432588.0, "reward": 1.1032366752624512, "reward_std": 0.4475897252559662, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7059151530265808, "rewards/tag_count_reward/std": 0.31308820843696594, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1022.5045166015625, "completions/mean_terminated_length": 771.8278198242188, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1455436577699643, "frac_reward_zero_std": 0.0, "grad_norm": 0.134775360114139, "kl": 0.016876220703125, "learning_rate": 9.944151508173017e-07, "loss": 0.1258, "num_tokens": 441957806.0, "reward": 1.23046875, "reward_std": 0.3669811189174652, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7349330186843872, "rewards/tag_count_reward/std": 0.31978318095207214, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1216.10498046875, "completions/mean_terminated_length": 883.3468627929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.14575675243727026, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11850368954786407, "kl": 0.013824462890625, "learning_rate": 9.943624498439214e-07, "loss": 0.0948, "num_tokens": 442571677.0, "reward": 1.1774554252624512, "reward_std": 0.46347200870513916, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7511160969734192, "rewards/tag_count_reward/std": 0.29584914445877075, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1017.0714721679688, "completions/mean_terminated_length": 854.5736694335938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1459698471045762, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13327447595877517, "kl": 0.0148773193359375, "learning_rate": 9.943095029499382e-07, "loss": 0.0982, "num_tokens": 443098621.0, "reward": 1.2181919813156128, "reward_std": 0.3946465849876404, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.2607157230377197, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1103.5223388671875, "completions/mean_terminated_length": 859.44384765625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.14618294177188215, "frac_reward_zero_std": 0.0, "grad_norm": 0.13797045192875015, "kl": 0.0156402587890625, "learning_rate": 9.942563101646544e-07, "loss": 0.1473, "num_tokens": 443665735.0, "reward": 1.2371652126312256, "reward_std": 0.4235488176345825, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7527901530265808, "rewards/tag_count_reward/std": 0.29795730113983154, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1197.94873046875, "completions/mean_terminated_length": 861.635498046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1463960364391881, "frac_reward_zero_std": 0.0, "grad_norm": 0.13630735009608785, "kl": 0.015106201171875, "learning_rate": 9.942028715175076e-07, "loss": 0.1282, "num_tokens": 444270976.0, "reward": 1.1841518878936768, "reward_std": 0.45379647612571716, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.33310166001319885, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 989.3817138671875, "completions/mean_terminated_length": 786.6675415039062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14660913110649407, "frac_reward_zero_std": 0.0, "grad_norm": 0.1335243266387221, "kl": 0.015411376953125, "learning_rate": 9.94149187038072e-07, "loss": 0.0808, "num_tokens": 444781211.0, "reward": 1.2371652126312256, "reward_std": 0.4472712278366089, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7840401530265808, "rewards/tag_count_reward/std": 0.2677413523197174, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1032.8304443359375, "completions/mean_terminated_length": 835.2106323242188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1468222257738, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1317542465611623, "kl": 0.0149383544921875, "learning_rate": 9.940952567560585e-07, "loss": 0.0915, "num_tokens": 445320399.0, "reward": 1.2209821939468384, "reward_std": 0.37385305762290955, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7901785969734192, "rewards/tag_count_reward/std": 0.28926268219947815, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 925.7701416015625, "completions/mean_terminated_length": 787.952392578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.14703532044110595, "frac_reward_zero_std": 0.0, "grad_norm": 0.13703578094580346, "kl": 0.01751708984375, "learning_rate": 9.940410807013129e-07, "loss": 0.0401, "num_tokens": 445805864.0, "reward": 1.2974331378936768, "reward_std": 0.33549997210502625, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8041294813156128, "rewards/tag_count_reward/std": 0.26174795627593994, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 903.7656860351562, "completions/mean_terminated_length": 719.9766845703125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1472484151084119, "frac_reward_zero_std": 0.0, "grad_norm": 0.1437043969359983, "kl": 0.017486572265625, "learning_rate": 9.939866589038172e-07, "loss": 0.0918, "num_tokens": 446277695.0, "reward": 1.3286831378936768, "reward_std": 0.34816738963127136, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7974330186843872, "rewards/tag_count_reward/std": 0.2864561975002289, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1056.76123046875, "completions/mean_terminated_length": 817.8753051757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14746150977571787, "frac_reward_zero_std": 0.0, "grad_norm": 0.13240029643150733, "kl": 0.015625, "learning_rate": 9.939319913936906e-07, "loss": 0.0807, "num_tokens": 446823428.0, "reward": 1.234375, "reward_std": 0.44145601987838745, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7700892686843872, "rewards/tag_count_reward/std": 0.3003323972225189, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 892.4129638671875, "completions/mean_terminated_length": 747.2387084960938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.14767460444302383, "frac_reward_zero_std": 0.0, "grad_norm": 0.1363330965749457, "kl": 0.019622802734375, "learning_rate": 9.938770782011864e-07, "loss": 0.0399, "num_tokens": 447284013.0, "reward": 1.3727679252624512, "reward_std": 0.3388535678386688, "rewards/accuracy_reward/mean": 0.5555555820465088, "rewards/accuracy_reward/std": 0.4974800646305084, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8370535969734192, "rewards/tag_count_reward/std": 0.25296854972839355, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 929.7723388671875, "completions/mean_terminated_length": 779.7316284179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14788769911032976, "frac_reward_zero_std": 0.0, "grad_norm": 0.13918389062211142, "kl": 0.017791748046875, "learning_rate": 9.938219193566956e-07, "loss": 0.0554, "num_tokens": 447767015.0, "reward": 1.4681919813156128, "reward_std": 0.37598365545272827, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8297991156578064, "rewards/tag_count_reward/std": 0.26157617568969727, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 994.7366333007812, "completions/mean_terminated_length": 838.0974731445312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.14810079377763571, "frac_reward_zero_std": 0.0, "grad_norm": 0.13991430281370598, "kl": 0.0179901123046875, "learning_rate": 9.93766514890744e-07, "loss": 0.0429, "num_tokens": 448286225.0, "reward": 1.2840402126312256, "reward_std": 0.366243451833725, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7974330186843872, "rewards/tag_count_reward/std": 0.2603755295276642, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1097.388427734375, "completions/mean_terminated_length": 896.9891967773438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.14831388844494167, "frac_reward_zero_std": 0.0, "grad_norm": 0.11412828181744235, "kl": 0.0153961181640625, "learning_rate": 9.937108648339939e-07, "loss": 0.0436, "num_tokens": 448841695.0, "reward": 1.2678571939468384, "reward_std": 0.45982861518859863, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8102678656578064, "rewards/tag_count_reward/std": 0.2722134292125702, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1125.7523193359375, "completions/mean_terminated_length": 864.1404418945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14852698311224763, "frac_reward_zero_std": 0.0, "grad_norm": 0.1345371092731451, "kl": 0.0174713134765625, "learning_rate": 9.936549692172433e-07, "loss": 0.1495, "num_tokens": 449411200.0, "reward": 1.2645089626312256, "reward_std": 0.4366303086280823, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.30836185812950134, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 950.1920166015625, "completions/mean_terminated_length": 777.1524658203125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.14874007777955356, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1284.670829674342, "kl": 29.88507080078125, "learning_rate": 9.93598828071426e-07, "loss": 1.2511, "num_tokens": 449910246.0, "reward": 1.4023438692092896, "reward_std": 0.33358901739120483, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8286830186843872, "rewards/tag_count_reward/std": 0.2744280993938446, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1084.5179443359375, "completions/mean_terminated_length": 807.6551513671875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.14895317244685952, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12905867254968686, "kl": 0.01666259765625, "learning_rate": 9.93542441427612e-07, "loss": 0.1086, "num_tokens": 450466494.0, "reward": 1.1863839626312256, "reward_std": 0.3764786124229431, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7712053656578064, "rewards/tag_count_reward/std": 0.2969778776168823, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1123.6004638671875, "completions/mean_terminated_length": 878.138427734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.14916626711416547, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13386271170882727, "kl": 0.017059326171875, "learning_rate": 9.934858093170071e-07, "loss": 0.1489, "num_tokens": 451039067.0, "reward": 1.4129464626312256, "reward_std": 0.4689089059829712, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7857142686843872, "rewards/tag_count_reward/std": 0.3161519467830658, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1037.99560546875, "completions/mean_terminated_length": 866.5848999023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.14937936178147143, "frac_reward_zero_std": 0.0, "grad_norm": 0.11442320032859457, "kl": 0.01531982421875, "learning_rate": 9.934289317709526e-07, "loss": 0.0855, "num_tokens": 451575849.0, "reward": 1.4648438692092896, "reward_std": 0.3985152542591095, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8443080186843872, "rewards/tag_count_reward/std": 0.27356699109077454, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 999.7277221679688, "completions/mean_terminated_length": 771.8424072265625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.14959245644877736, "frac_reward_zero_std": 0.0, "grad_norm": 1.6945461007664862, "kl": 0.0181427001953125, "learning_rate": 9.93371808820926e-07, "loss": 0.0832, "num_tokens": 452092255.0, "reward": 1.4017857313156128, "reward_std": 0.4318530559539795, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8191964030265808, "rewards/tag_count_reward/std": 0.29340213537216187, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 990.8951416015625, "completions/mean_terminated_length": 833.6846313476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.14980555111608332, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11993278057004805, "kl": 0.01715087890625, "learning_rate": 9.933144404985405e-07, "loss": 0.0949, "num_tokens": 452604560.0, "reward": 1.3828126192092896, "reward_std": 0.35533708333969116, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8113839030265808, "rewards/tag_count_reward/std": 0.2750307619571686, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1062.774658203125, "completions/mean_terminated_length": 828.7155151367188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.15001864578338928, "frac_reward_zero_std": 0.0, "grad_norm": 0.14157479382569624, "kl": 0.0167083740234375, "learning_rate": 9.932568268355448e-07, "loss": 0.1253, "num_tokens": 453158187.0, "reward": 1.1501116752624512, "reward_std": 0.4373627007007599, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7639508843421936, "rewards/tag_count_reward/std": 0.3123137056827545, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 959.9732666015625, "completions/mean_terminated_length": 778.6354370117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15023174045069523, "frac_reward_zero_std": 0.0, "grad_norm": 0.13468308929565764, "kl": 0.0194091796875, "learning_rate": 9.93198967863824e-07, "loss": 0.1071, "num_tokens": 453655455.0, "reward": 1.3465402126312256, "reward_std": 0.411983847618103, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8018973469734192, "rewards/tag_count_reward/std": 0.2792443037033081, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1068.390625, "completions/mean_terminated_length": 848.915283203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15044483511800116, "frac_reward_zero_std": 0.0, "grad_norm": 0.14071724135958796, "kl": 0.0152587890625, "learning_rate": 9.931408636153984e-07, "loss": 0.1387, "num_tokens": 454209166.0, "reward": 1.203125, "reward_std": 0.41122350096702576, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7723214030265808, "rewards/tag_count_reward/std": 0.3107031285762787, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1065.734375, "completions/mean_terminated_length": 787.0974731445312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.15065792978530712, "frac_reward_zero_std": 0.0, "grad_norm": 0.14508417827500644, "kl": 0.01678466796875, "learning_rate": 9.930825141224242e-07, "loss": 0.0973, "num_tokens": 454750295.0, "reward": 1.29296875, "reward_std": 0.32834291458129883, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7728794813156128, "rewards/tag_count_reward/std": 0.3022213876247406, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1098.12060546875, "completions/mean_terminated_length": 865.9277954101562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.15087102445261308, "frac_reward_zero_std": 0.0, "grad_norm": 0.13611513176707363, "kl": 0.016754150390625, "learning_rate": 9.930239194171937e-07, "loss": 0.1119, "num_tokens": 455309853.0, "reward": 1.2868304252624512, "reward_std": 0.4366273283958435, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7935267686843872, "rewards/tag_count_reward/std": 0.27539366483688354, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1066.9107666015625, "completions/mean_terminated_length": 816.8291625976562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.15108411911991904, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14360486659779728, "kl": 0.0167999267578125, "learning_rate": 9.929650795321344e-07, "loss": 0.1054, "num_tokens": 455862181.0, "reward": 1.3577009439468384, "reward_std": 0.3859207034111023, "rewards/accuracy_reward/mean": 0.5671296119689941, "rewards/accuracy_reward/std": 0.49604758620262146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8108258843421936, "rewards/tag_count_reward/std": 0.2863602936267853, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 966.10498046875, "completions/mean_terminated_length": 795.5736694335938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.15129721378722497, "frac_reward_zero_std": 0.0, "grad_norm": 0.14821180553149105, "kl": 0.02020263671875, "learning_rate": 9.929059944998096e-07, "loss": 0.0871, "num_tokens": 456360868.0, "reward": 1.4977679252624512, "reward_std": 0.38166478276252747, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8504464030265808, "rewards/tag_count_reward/std": 0.25897711515426636, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1044.3616943359375, "completions/mean_terminated_length": 848.9866333007812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.15151030845453092, "frac_reward_zero_std": 0.0, "grad_norm": 0.15104417443856227, "kl": 0.017364501953125, "learning_rate": 9.928466643529185e-07, "loss": 0.1025, "num_tokens": 456898198.0, "reward": 1.3515626192092896, "reward_std": 0.43090930581092834, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8180803656578064, "rewards/tag_count_reward/std": 0.28051385283470154, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1136.40625, "completions/mean_terminated_length": 922.9476928710938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.15172340312183688, "frac_reward_zero_std": 0.0, "grad_norm": 0.12324357361636927, "kl": 0.0167083740234375, "learning_rate": 9.927870891242956e-07, "loss": 0.0962, "num_tokens": 457475164.0, "reward": 1.3521206378936768, "reward_std": 0.37399131059646606, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7963169813156128, "rewards/tag_count_reward/std": 0.28270989656448364, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1073.0670166015625, "completions/mean_terminated_length": 834.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15193649778914284, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13059621304588653, "kl": 0.0170135498046875, "learning_rate": 9.927272688469115e-07, "loss": 0.0954, "num_tokens": 458027978.0, "reward": 1.301897406578064, "reward_std": 0.3749329745769501, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7818080186843872, "rewards/tag_count_reward/std": 0.32030200958251953, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1022.8995971679688, "completions/mean_terminated_length": 867.4215698242188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.15214959245644877, "frac_reward_zero_std": 0.0, "grad_norm": 0.13191018403219468, "kl": 0.018768310546875, "learning_rate": 9.926672035538716e-07, "loss": 0.1032, "num_tokens": 458550429.0, "reward": 1.3470982313156128, "reward_std": 0.3803008794784546, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8158482313156128, "rewards/tag_count_reward/std": 0.2667539417743683, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1229.44873046875, "completions/mean_terminated_length": 923.11962890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15236268712375473, "frac_reward_zero_std": 0.0, "grad_norm": 0.12096691964925119, "kl": 0.01397705078125, "learning_rate": 9.926068932784182e-07, "loss": 0.1473, "num_tokens": 459181046.0, "reward": 1.1891741752624512, "reward_std": 0.4436335861682892, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7606026530265808, "rewards/tag_count_reward/std": 0.3124455511569977, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1063.6942138671875, "completions/mean_terminated_length": 865.7775268554688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.15257578179106068, "frac_reward_zero_std": 0.0, "grad_norm": 0.15111345566286427, "kl": 0.01824951171875, "learning_rate": 9.92546338053928e-07, "loss": 0.1155, "num_tokens": 459722157.0, "reward": 1.328125, "reward_std": 0.4215332567691803, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.28088077902793884, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1286.83935546875, "completions/mean_terminated_length": 995.5308837890625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.15278887645836664, "frac_reward_zero_std": 0.0, "grad_norm": 0.12152851655631211, "kl": 0.01513671875, "learning_rate": 9.924855379139136e-07, "loss": 0.0678, "num_tokens": 460373573.0, "reward": 1.1037946939468384, "reward_std": 0.4427836537361145, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399619221687317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7645089030265808, "rewards/tag_count_reward/std": 0.3187141716480255, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1154.8973388671875, "completions/mean_terminated_length": 917.7457885742188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.15300197112567257, "frac_reward_zero_std": 0.0, "grad_norm": 0.12879353434402135, "kl": 0.018646240234375, "learning_rate": 9.924244928920232e-07, "loss": 0.0832, "num_tokens": 460959703.0, "reward": 1.2075893878936768, "reward_std": 0.3688276708126068, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7611607313156128, "rewards/tag_count_reward/std": 0.29939982295036316, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1234.384033203125, "completions/mean_terminated_length": 969.59765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15321506579297853, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11247017455206236, "kl": 0.0162506103515625, "learning_rate": 9.923632030220411e-07, "loss": 0.1086, "num_tokens": 461588083.0, "reward": 1.2399554252624512, "reward_std": 0.42860835790634155, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7555803656578064, "rewards/tag_count_reward/std": 0.31811821460723877, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1195.759033203125, "completions/mean_terminated_length": 925.0470581054688, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.15342816046028449, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13108526378125848, "kl": 0.0157318115234375, "learning_rate": 9.923016683378858e-07, "loss": 0.1299, "num_tokens": 462193287.0, "reward": 1.3013393878936768, "reward_std": 0.43348729610443115, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7834821343421936, "rewards/tag_count_reward/std": 0.3137339651584625, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1172.5, "completions/mean_terminated_length": 911.118896484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15364125512759044, "frac_reward_zero_std": 0.0, "grad_norm": 0.13039838088176925, "kl": 0.0148162841796875, "learning_rate": 9.922398888736125e-07, "loss": 0.1138, "num_tokens": 462789559.0, "reward": 1.2985491752624512, "reward_std": 0.4117524325847626, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8030133843421936, "rewards/tag_count_reward/std": 0.2974666655063629, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1093.1116943359375, "completions/mean_terminated_length": 885.5271606445312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.15385434979489637, "frac_reward_zero_std": 0.0, "grad_norm": 0.12760346976845133, "kl": 0.018646240234375, "learning_rate": 9.921778646634114e-07, "loss": 0.0534, "num_tokens": 463350105.0, "reward": 1.3978794813156128, "reward_std": 0.37394532561302185, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8309151530265808, "rewards/tag_count_reward/std": 0.27224037051200867, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1096.8817138671875, "completions/mean_terminated_length": 880.6000366210938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.15406744446220233, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13402977516786121, "kl": 0.018707275390625, "learning_rate": 9.921155957416078e-07, "loss": 0.1012, "num_tokens": 463904276.0, "reward": 1.4380581378936768, "reward_std": 0.4119695723056793, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.28169241547584534, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1124.169677734375, "completions/mean_terminated_length": 865.4971313476562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1542805391295083, "frac_reward_zero_std": 0.0, "grad_norm": 0.19897404578965877, "kl": 0.01806640625, "learning_rate": 9.92053082142663e-07, "loss": 0.1208, "num_tokens": 464478272.0, "reward": 1.2561384439468384, "reward_std": 0.4223388433456421, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7940848469734192, "rewards/tag_count_reward/std": 0.30949485301971436, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1089.118408203125, "completions/mean_terminated_length": 861.3176879882812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15449363379681424, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13392246930947885, "kl": 0.016998291015625, "learning_rate": 9.919903239011737e-07, "loss": 0.102, "num_tokens": 465028165.0, "reward": 1.3314732313156128, "reward_std": 0.35138410329818726, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8158482313156128, "rewards/tag_count_reward/std": 0.27039816975593567, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1188.8616943359375, "completions/mean_terminated_length": 922.5789794921875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.15470672846412017, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12228055305980295, "kl": 0.0157623291015625, "learning_rate": 9.919273210518715e-07, "loss": 0.0876, "num_tokens": 465624967.0, "reward": 1.3666294813156128, "reward_std": 0.4116423726081848, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7907366156578064, "rewards/tag_count_reward/std": 0.32146915793418884, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1089.337158203125, "completions/mean_terminated_length": 844.9719848632812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15491982313142613, "frac_reward_zero_std": 0.0, "grad_norm": 0.14355790277747807, "kl": 0.01849365234375, "learning_rate": 9.91864073629624e-07, "loss": 0.1357, "num_tokens": 466177694.0, "reward": 1.4051339626312256, "reward_std": 0.43526557087898254, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8113839030265808, "rewards/tag_count_reward/std": 0.3090250790119171, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 957.9710083007812, "completions/mean_terminated_length": 830.2119750976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1551329177987321, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13402672741771077, "kl": 0.021240234375, "learning_rate": 9.918005816694333e-07, "loss": 0.0655, "num_tokens": 466678049.0, "reward": 1.3850446939468384, "reward_std": 0.3447449803352356, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8448660969734192, "rewards/tag_count_reward/std": 0.24845468997955322, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1354.540283203125, "completions/mean_terminated_length": 1022.6865234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15534601246603805, "frac_reward_zero_std": 0.0, "grad_norm": 0.4201026315881484, "kl": 0.0129852294921875, "learning_rate": 9.917368452064377e-07, "loss": 0.0896, "num_tokens": 467367571.0, "reward": 1.114397406578064, "reward_std": 0.49074214696884155, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7572544813156128, "rewards/tag_count_reward/std": 0.33832037448883057, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1094.9866943359375, "completions/mean_terminated_length": 803.2478637695312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15555910713334398, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12633936669659696, "kl": 0.0172576904296875, "learning_rate": 9.916728642759102e-07, "loss": 0.1436, "num_tokens": 467925741.0, "reward": 1.2801339626312256, "reward_std": 0.3484698235988617, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8180803656578064, "rewards/tag_count_reward/std": 0.29031166434288025, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 872.591552734375, "completions/mean_terminated_length": 757.3554077148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15577220180064993, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14002578637452331, "kl": 0.021575927734375, "learning_rate": 9.916086389132597e-07, "loss": 0.067, "num_tokens": 468390854.0, "reward": 1.6194196939468384, "reward_std": 0.3139893114566803, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44215917587280273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.22906675934791565, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1100.2679443359375, "completions/mean_terminated_length": 862.0111694335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1559852964679559, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11748074405754115, "kl": 0.01751708984375, "learning_rate": 9.915441691540297e-07, "loss": 0.0879, "num_tokens": 468953918.0, "reward": 1.3476563692092896, "reward_std": 0.33276113867759705, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.31494081020355225, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 887.82373046875, "completions/mean_terminated_length": 725.4580078125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.15619839113526185, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15928226123459177, "kl": 0.022308349609375, "learning_rate": 9.914794550338994e-07, "loss": 0.071, "num_tokens": 469421503.0, "reward": 1.551897406578064, "reward_std": 0.30738532543182373, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8844866156578064, "rewards/tag_count_reward/std": 0.24897858500480652, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1078.8013916015625, "completions/mean_terminated_length": 838.5264282226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.15641148580256778, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12418531600802606, "kl": 0.0156097412109375, "learning_rate": 9.914144965886833e-07, "loss": 0.084, "num_tokens": 469975558.0, "reward": 1.1891741752624512, "reward_std": 0.31078848242759705, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.48164102435112, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8253348469734192, "rewards/tag_count_reward/std": 0.28337591886520386, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1003.5826416015625, "completions/mean_terminated_length": 832.6779174804688, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.15662458046987374, "frac_reward_zero_std": 0.0, "grad_norm": 0.14321156310072328, "kl": 0.02032470703125, "learning_rate": 9.913492938543305e-07, "loss": 0.0912, "num_tokens": 470494859.0, "reward": 1.4285714626312256, "reward_std": 0.4269440770149231, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8660714030265808, "rewards/tag_count_reward/std": 0.2528994381427765, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1023.3750610351562, "completions/mean_terminated_length": 830.408447265625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1568376751371797, "frac_reward_zero_std": 0.0, "grad_norm": 0.14141820733342655, "kl": 0.0167236328125, "learning_rate": 9.91283846866926e-07, "loss": 0.1505, "num_tokens": 471021283.0, "reward": 1.3454241752624512, "reward_std": 0.4020189642906189, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8543526530265808, "rewards/tag_count_reward/std": 0.25877881050109863, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1037.341552734375, "completions/mean_terminated_length": 800.6859741210938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15705076980448565, "frac_reward_zero_std": 0.0, "grad_norm": 0.12850274220007044, "kl": 0.0179443359375, "learning_rate": 9.912181556626896e-07, "loss": 0.0734, "num_tokens": 471560428.0, "reward": 1.4743304252624512, "reward_std": 0.4303416311740875, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.25071555376052856, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1114.390625, "completions/mean_terminated_length": 814.2035522460938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.15726386447179158, "frac_reward_zero_std": 0.0, "grad_norm": 0.13384207113838262, "kl": 0.0174560546875, "learning_rate": 9.911522202779766e-07, "loss": 0.1161, "num_tokens": 472125755.0, "reward": 1.2723214626312256, "reward_std": 0.3943602740764618, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7946428656578064, "rewards/tag_count_reward/std": 0.30000796914100647, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1147.52685546875, "completions/mean_terminated_length": 882.0693359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15747695913909754, "frac_reward_zero_std": 0.0, "grad_norm": 0.12572316341902748, "kl": 0.015289306640625, "learning_rate": 9.910860407492768e-07, "loss": 0.1114, "num_tokens": 472713415.0, "reward": 1.2712054252624512, "reward_std": 0.39238712191581726, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.30665677785873413, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 950.2388916015625, "completions/mean_terminated_length": 743.4986572265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1576900538064035, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11960700352863632, "kl": 0.01824951171875, "learning_rate": 9.910196171132157e-07, "loss": 0.1202, "num_tokens": 473203282.0, "reward": 1.4185268878936768, "reward_std": 0.4130321443080902, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8649553656578064, "rewards/tag_count_reward/std": 0.2517491281032562, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1051.40625, "completions/mean_terminated_length": 838.0433959960938, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.15790314847370945, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13473806243376207, "kl": 0.017364501953125, "learning_rate": 9.90952949406554e-07, "loss": 0.1287, "num_tokens": 473747640.0, "reward": 1.3431919813156128, "reward_std": 0.4228634536266327, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8454241156578064, "rewards/tag_count_reward/std": 0.2762327194213867, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1029.352783203125, "completions/mean_terminated_length": 790.8264770507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15811624314101538, "frac_reward_zero_std": 0.0, "grad_norm": 0.12925176864888338, "kl": 0.018096923828125, "learning_rate": 9.908860376661865e-07, "loss": 0.097, "num_tokens": 474275606.0, "reward": 1.3671876192092896, "reward_std": 0.40404853224754333, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.263893187046051, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1081.4888916015625, "completions/mean_terminated_length": 855.1708374023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.15832933780832134, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12945618341875803, "kl": 0.0189208984375, "learning_rate": 9.908188819291442e-07, "loss": 0.1244, "num_tokens": 474833105.0, "reward": 1.4520089626312256, "reward_std": 0.42613303661346436, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8694196343421936, "rewards/tag_count_reward/std": 0.2562901973724365, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1025.040283203125, "completions/mean_terminated_length": 822.6363525390625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1585424324756273, "frac_reward_zero_std": 0.0, "grad_norm": 0.14299017438568432, "kl": 0.018035888671875, "learning_rate": 9.907514822325928e-07, "loss": 0.0939, "num_tokens": 475362227.0, "reward": 1.1930804252624512, "reward_std": 0.40716296434402466, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8470982313156128, "rewards/tag_count_reward/std": 0.27741706371307373, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1211.399658203125, "completions/mean_terminated_length": 919.0933227539062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.15875552714293326, "frac_reward_zero_std": 0.0, "grad_norm": 0.13378513087902502, "kl": 0.0160980224609375, "learning_rate": 9.906838386138324e-07, "loss": 0.1171, "num_tokens": 475971478.0, "reward": 1.3013393878936768, "reward_std": 0.3755875527858734, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.29353824257850647, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1019.8638916015625, "completions/mean_terminated_length": 851.6233520507812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.15896862181023919, "frac_reward_zero_std": 0.0, "grad_norm": 0.12413652172631166, "kl": 0.0186767578125, "learning_rate": 9.90615951110299e-07, "loss": 0.0866, "num_tokens": 476500681.0, "reward": 1.4765626192092896, "reward_std": 0.4106946885585785, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8604910969734192, "rewards/tag_count_reward/std": 0.2602730095386505, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 993.6607666015625, "completions/mean_terminated_length": 774.8355712890625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.15918171647754514, "frac_reward_zero_std": 0.0, "grad_norm": 0.14079078597187705, "kl": 0.018707275390625, "learning_rate": 9.905478197595628e-07, "loss": 0.0752, "num_tokens": 477019553.0, "reward": 1.3783482313156128, "reward_std": 0.35385826230049133, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2660040855407715, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1115.1138916015625, "completions/mean_terminated_length": 912.3125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1593948111448511, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12223112834888018, "kl": 0.0166473388671875, "learning_rate": 9.904794445993294e-07, "loss": 0.0789, "num_tokens": 477595748.0, "reward": 1.2868304252624512, "reward_std": 0.3850972354412079, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8270089030265808, "rewards/tag_count_reward/std": 0.2684706449508667, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1117.9107666015625, "completions/mean_terminated_length": 896.9503173828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.15960790581215706, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1234044162058948, "kl": 0.0186767578125, "learning_rate": 9.904108256674394e-07, "loss": 0.1167, "num_tokens": 478161084.0, "reward": 1.4559152126312256, "reward_std": 0.4311186373233795, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8621651530265808, "rewards/tag_count_reward/std": 0.2749870717525482, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 983.5223388671875, "completions/mean_terminated_length": 837.62939453125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.159821000479463, "frac_reward_zero_std": 0.0, "grad_norm": 0.144074280425053, "kl": 0.023834228515625, "learning_rate": 9.903419630018676e-07, "loss": 0.0948, "num_tokens": 478665526.0, "reward": 1.4860491752624512, "reward_std": 0.35792022943496704, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8856026530265808, "rewards/tag_count_reward/std": 0.24152126908302307, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1047.388427734375, "completions/mean_terminated_length": 836.4486694335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16003409514676895, "frac_reward_zero_std": 0.0, "grad_norm": 0.13387868305106246, "kl": 0.019073486328125, "learning_rate": 9.902728566407248e-07, "loss": 0.0782, "num_tokens": 479206916.0, "reward": 1.4213169813156128, "reward_std": 0.4172287583351135, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8610491156578064, "rewards/tag_count_reward/std": 0.26137566566467285, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1201.6295166015625, "completions/mean_terminated_length": 916.1373291015625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1602471898140749, "frac_reward_zero_std": 0.0, "grad_norm": 0.12034827622371361, "kl": 0.0143890380859375, "learning_rate": 9.90203506622256e-07, "loss": 0.1101, "num_tokens": 479820734.0, "reward": 1.165178656578064, "reward_std": 0.41876834630966187, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8236607313156128, "rewards/tag_count_reward/std": 0.2989324629306793, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1060.671875, "completions/mean_terminated_length": 822.728515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.16046028448138086, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13125703208306563, "kl": 0.01751708984375, "learning_rate": 9.90133912984841e-07, "loss": 0.0581, "num_tokens": 480364011.0, "reward": 1.3521206378936768, "reward_std": 0.3559810519218445, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.2688673436641693, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1127.654052734375, "completions/mean_terminated_length": 869.9571533203125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.16067337914868682, "frac_reward_zero_std": 0.0, "grad_norm": 0.14100222889528688, "kl": 0.017333984375, "learning_rate": 9.900640757669943e-07, "loss": 0.0981, "num_tokens": 480938480.0, "reward": 1.321428656578064, "reward_std": 0.3799952268600464, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.26918813586235046, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1179.7545166015625, "completions/mean_terminated_length": 982.3178100585938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16088647381599275, "frac_reward_zero_std": 0.0, "grad_norm": 0.12578606899914582, "kl": 0.017486572265625, "learning_rate": 9.899939950073658e-07, "loss": 0.1053, "num_tokens": 481537154.0, "reward": 1.4157366752624512, "reward_std": 0.4477072060108185, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8643973469734192, "rewards/tag_count_reward/std": 0.27508240938186646, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1008.2545166015625, "completions/mean_terminated_length": 785.6531372070312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1610995684832987, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13284076978290218, "kl": 0.017913818359375, "learning_rate": 9.899236707447399e-07, "loss": 0.0999, "num_tokens": 482053652.0, "reward": 1.3543527126312256, "reward_std": 0.34607964754104614, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.28354763984680176, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1031.904052734375, "completions/mean_terminated_length": 807.64306640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16131266315060466, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14028527464326643, "kl": 0.02081298828125, "learning_rate": 9.898531030180353e-07, "loss": 0.0651, "num_tokens": 482588841.0, "reward": 1.3867188692092896, "reward_std": 0.4294005334377289, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8510044813156128, "rewards/tag_count_reward/std": 0.28425124287605286, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1086.419677734375, "completions/mean_terminated_length": 851.36669921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16152575781791062, "frac_reward_zero_std": 0.0, "grad_norm": 0.21480122121295042, "kl": 0.0209197998046875, "learning_rate": 9.897822918663062e-07, "loss": 0.1047, "num_tokens": 483150101.0, "reward": 1.2918527126312256, "reward_std": 0.3935893774032593, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8342633843421936, "rewards/tag_count_reward/std": 0.2882158160209656, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 982.7723388671875, "completions/mean_terminated_length": 768.58447265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16173885248521655, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14896821734820667, "kl": 0.019195556640625, "learning_rate": 9.89711237328741e-07, "loss": 0.1176, "num_tokens": 483673471.0, "reward": 1.2806919813156128, "reward_std": 0.38954755663871765, "rewards/accuracy_reward/mean": 0.48317307233810425, "rewards/accuracy_reward/std": 0.5003184676170349, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2969752550125122, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 976.07373046875, "completions/mean_terminated_length": 780.9208984375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1619519471525225, "frac_reward_zero_std": 0.0, "grad_norm": 0.11987655576536607, "kl": 0.022247314453125, "learning_rate": 9.896399394446628e-07, "loss": 0.0776, "num_tokens": 484182176.0, "reward": 1.4001116752624512, "reward_std": 0.3662990927696228, "rewards/accuracy_reward/mean": 0.5324074029922485, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23972615599632263, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 991.4910888671875, "completions/mean_terminated_length": 782.44921875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.16216504181982846, "frac_reward_zero_std": 0.0, "grad_norm": 0.1376876735088269, "kl": 0.018768310546875, "learning_rate": 9.895683982535298e-07, "loss": 0.0981, "num_tokens": 484696636.0, "reward": 1.4146206378936768, "reward_std": 0.36274465918540955, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.27093952894210815, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 962.5692138671875, "completions/mean_terminated_length": 810.6641235351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16237813648713442, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14300657442732245, "kl": 0.020263671875, "learning_rate": 9.894966137949346e-07, "loss": 0.1081, "num_tokens": 485200155.0, "reward": 1.4637277126312256, "reward_std": 0.3754221200942993, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8900669813156128, "rewards/tag_count_reward/std": 0.2492642104625702, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1195.638427734375, "completions/mean_terminated_length": 854.6937866210938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16259123115444035, "frac_reward_zero_std": 0.0, "grad_norm": 0.11529307523180332, "kl": 0.01556396484375, "learning_rate": 9.89424586108604e-07, "loss": 0.1118, "num_tokens": 485803289.0, "reward": 1.2767857313156128, "reward_std": 0.4165794551372528, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.31335172057151794, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1028.165283203125, "completions/mean_terminated_length": 757.361572265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.1628043258217463, "frac_reward_zero_std": 0.0, "grad_norm": 0.13693380628260118, "kl": 0.0191650390625, "learning_rate": 9.893523152344004e-07, "loss": 0.1402, "num_tokens": 486331811.0, "reward": 1.2349331378936768, "reward_std": 0.4471706449985504, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.3088892102241516, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1021.6339721679688, "completions/mean_terminated_length": 828.3395385742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16301742048905227, "frac_reward_zero_std": 0.0, "grad_norm": 0.13890666175283908, "kl": 0.01776123046875, "learning_rate": 9.892798012123195e-07, "loss": 0.0771, "num_tokens": 486859775.0, "reward": 1.3186384439468384, "reward_std": 0.3490552306175232, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8900669813156128, "rewards/tag_count_reward/std": 0.2366020530462265, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 995.9598388671875, "completions/mean_terminated_length": 817.4151611328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.16323051515635822, "frac_reward_zero_std": 0.0, "grad_norm": 0.13509418464534523, "kl": 0.018524169921875, "learning_rate": 9.892070440824929e-07, "loss": 0.0566, "num_tokens": 487382957.0, "reward": 1.403459906578064, "reward_std": 0.4495825171470642, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8677455186843872, "rewards/tag_count_reward/std": 0.2567932605743408, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1042.5379638671875, "completions/mean_terminated_length": 803.6713256835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16344360982366415, "frac_reward_zero_std": 0.0, "grad_norm": 0.1319257598959288, "kl": 0.01702880859375, "learning_rate": 9.891340438851858e-07, "loss": 0.148, "num_tokens": 487922574.0, "reward": 1.2890625, "reward_std": 0.43297436833381653, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8426339030265808, "rewards/tag_count_reward/std": 0.2883094549179077, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 960.2991333007812, "completions/mean_terminated_length": 801.7340087890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1636567044909701, "frac_reward_zero_std": 0.0, "grad_norm": 0.1376753544507727, "kl": 0.019317626953125, "learning_rate": 9.89060800660798e-07, "loss": 0.0594, "num_tokens": 488423508.0, "reward": 1.4090402126312256, "reward_std": 0.322277694940567, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8934151530265808, "rewards/tag_count_reward/std": 0.2381325215101242, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 921.419677734375, "completions/mean_terminated_length": 719.8211059570312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.16386979915827607, "frac_reward_zero_std": 0.0, "grad_norm": 0.1381497842949636, "kl": 0.020263671875, "learning_rate": 9.889873144498646e-07, "loss": 0.122, "num_tokens": 488903904.0, "reward": 1.4748884439468384, "reward_std": 0.366472065448761, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8677455186843872, "rewards/tag_count_reward/std": 0.26111283898353577, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1036.602783203125, "completions/mean_terminated_length": 760.76708984375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.16408289382558203, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13640681850162104, "kl": 0.017303466796875, "learning_rate": 9.889135852930541e-07, "loss": 0.1298, "num_tokens": 489432094.0, "reward": 1.422991156578064, "reward_std": 0.40439870953559875, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8805803656578064, "rewards/tag_count_reward/std": 0.2654026746749878, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 941.8504638671875, "completions/mean_terminated_length": 740.467041015625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.16429598849288796, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1743098149080204, "kl": 0.01934814453125, "learning_rate": 9.888396132311703e-07, "loss": 0.1406, "num_tokens": 489919787.0, "reward": 1.3889509439468384, "reward_std": 0.404680460691452, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8621651530265808, "rewards/tag_count_reward/std": 0.26829564571380615, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 956.029052734375, "completions/mean_terminated_length": 790.4087524414062, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.16450908316019391, "frac_reward_zero_std": 0.0, "grad_norm": 0.14199491995229113, "kl": 0.0177001953125, "learning_rate": 9.887653983051506e-07, "loss": 0.1274, "num_tokens": 490418024.0, "reward": 1.274553656578064, "reward_std": 0.3672487139701843, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8482142686843872, "rewards/tag_count_reward/std": 0.27089864015579224, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 992.7857666015625, "completions/mean_terminated_length": 790.723388671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16472217782749987, "frac_reward_zero_std": 0.0, "grad_norm": 0.14502102409347226, "kl": 0.018035888671875, "learning_rate": 9.886909405560675e-07, "loss": 0.1574, "num_tokens": 490931112.0, "reward": 1.3828126192092896, "reward_std": 0.41127461194992065, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8470982313156128, "rewards/tag_count_reward/std": 0.28536733984947205, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1010.4397583007812, "completions/mean_terminated_length": 831.1754150390625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.16493527249480583, "frac_reward_zero_std": 0.0, "grad_norm": 0.12931928876788112, "kl": 0.01715087890625, "learning_rate": 9.88616240025128e-07, "loss": 0.0495, "num_tokens": 491452253.0, "reward": 1.4598214626312256, "reward_std": 0.34742823243141174, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.49405214190483093, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8794642686843872, "rewards/tag_count_reward/std": 0.2524646520614624, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1137.77685546875, "completions/mean_terminated_length": 859.1370849609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.16514836716211176, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.3383580850687435, "kl": 0.0629730224609375, "learning_rate": 9.885412967536728e-07, "loss": 0.1246, "num_tokens": 492050601.0, "reward": 1.32421875, "reward_std": 0.4323054254055023, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8376116156578064, "rewards/tag_count_reward/std": 0.2910819947719574, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1064.915283203125, "completions/mean_terminated_length": 888.9947509765625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.16536146182941772, "frac_reward_zero_std": 0.0, "grad_norm": 0.12153622803148827, "kl": 0.0170745849609375, "learning_rate": 9.884661107831773e-07, "loss": 0.0593, "num_tokens": 492599155.0, "reward": 1.368303656578064, "reward_std": 0.3781953752040863, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8727678656578064, "rewards/tag_count_reward/std": 0.2590542435646057, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1010.1473388671875, "completions/mean_terminated_length": 798.1129150390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16557455649672367, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.18556961100941505, "kl": 0.019195556640625, "learning_rate": 9.883906821552514e-07, "loss": 0.0872, "num_tokens": 493115829.0, "reward": 1.403459906578064, "reward_std": 0.4156063497066498, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8900669813156128, "rewards/tag_count_reward/std": 0.25808316469192505, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1048.727783203125, "completions/mean_terminated_length": 903.0537109375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.16578765116402963, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12083756949635982, "kl": 0.016815185546875, "learning_rate": 9.883150109116386e-07, "loss": 0.1142, "num_tokens": 493653483.0, "reward": 1.383928656578064, "reward_std": 0.3617124855518341, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.23851032555103302, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1033.154052734375, "completions/mean_terminated_length": 815.8834838867188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16600074583133556, "frac_reward_zero_std": 0.0, "grad_norm": 0.13837038557885342, "kl": 0.01910400390625, "learning_rate": 9.882390970942176e-07, "loss": 0.1002, "num_tokens": 494182064.0, "reward": 1.4257813692092896, "reward_std": 0.43116581439971924, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8722098469734192, "rewards/tag_count_reward/std": 0.2659730017185211, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1079.94873046875, "completions/mean_terminated_length": 875.8729858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16621384049864152, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13287125394033297, "kl": 0.01800537109375, "learning_rate": 9.881629407450006e-07, "loss": 0.073, "num_tokens": 494744057.0, "reward": 1.4001116752624512, "reward_std": 0.32378655672073364, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8800223469734192, "rewards/tag_count_reward/std": 0.25189971923828125, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1082.265625, "completions/mean_terminated_length": 865.8988647460938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.16642693516594748, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12037159581499403, "kl": 0.0162506103515625, "learning_rate": 9.880865419061344e-07, "loss": 0.1045, "num_tokens": 495304544.0, "reward": 1.2840402126312256, "reward_std": 0.4162614941596985, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8643973469734192, "rewards/tag_count_reward/std": 0.2771081030368805, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1050.140625, "completions/mean_terminated_length": 836.5067749023438, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.16664002983325343, "frac_reward_zero_std": 0.0, "grad_norm": 0.2314128222723499, "kl": 0.01947021484375, "learning_rate": 9.880099006198998e-07, "loss": 0.0892, "num_tokens": 495848303.0, "reward": 1.4006696939468384, "reward_std": 0.3899260461330414, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2321632206439972, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1118.55810546875, "completions/mean_terminated_length": 778.5182495117188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16685312450055936, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11867931289512722, "kl": 0.0167999267578125, "learning_rate": 9.879330169287121e-07, "loss": 0.08, "num_tokens": 496419369.0, "reward": 1.3141741752624512, "reward_std": 0.3667290508747101, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8699776530265808, "rewards/tag_count_reward/std": 0.2659401297569275, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1125.1429443359375, "completions/mean_terminated_length": 853.086669921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.16706621916786532, "frac_reward_zero_std": 0.0, "grad_norm": 0.12625275680782078, "kl": 0.01666259765625, "learning_rate": 9.878558908751205e-07, "loss": 0.1155, "num_tokens": 497000329.0, "reward": 1.3543527126312256, "reward_std": 0.4074389636516571, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8364955186843872, "rewards/tag_count_reward/std": 0.30225032567977905, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1078.5223388671875, "completions/mean_terminated_length": 861.31689453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.16727931383517128, "frac_reward_zero_std": 0.0, "grad_norm": 0.29144328051957286, "kl": 0.0318603515625, "learning_rate": 9.877785225018085e-07, "loss": 0.1253, "num_tokens": 497561491.0, "reward": 1.2767857313156128, "reward_std": 0.44960084557533264, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8683035969734192, "rewards/tag_count_reward/std": 0.27265334129333496, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1149.19873046875, "completions/mean_terminated_length": 820.368896484375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.16749240850247724, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11643618902357854, "kl": 0.017364501953125, "learning_rate": 9.87700911851593e-07, "loss": 0.0868, "num_tokens": 498145836.0, "reward": 1.3599331378936768, "reward_std": 0.42775994539260864, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8175223469734192, "rewards/tag_count_reward/std": 0.31514689326286316, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1270.4754638671875, "completions/mean_terminated_length": 949.1640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16770550316978317, "frac_reward_zero_std": 0.0, "grad_norm": 0.1158507526117939, "kl": 0.013671875, "learning_rate": 9.87623058967426e-07, "loss": 0.0857, "num_tokens": 498782209.0, "reward": 1.1891741752624512, "reward_std": 0.4404595196247101, "rewards/accuracy_reward/mean": 0.3504464328289032, "rewards/accuracy_reward/std": 0.47764310240745544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8387276530265808, "rewards/tag_count_reward/std": 0.29598572850227356, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1185.7054443359375, "completions/mean_terminated_length": 891.3892822265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.16791859783708912, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11619686081921426, "kl": 0.0176239013671875, "learning_rate": 9.875449638923935e-07, "loss": 0.0715, "num_tokens": 499383533.0, "reward": 1.2818081378936768, "reward_std": 0.372024804353714, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8911830186843872, "rewards/tag_count_reward/std": 0.25692933797836304, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1139.7366943359375, "completions/mean_terminated_length": 875.3717041015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.16813169250439508, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13274790507710013, "kl": 0.015838623046875, "learning_rate": 9.874666266697141e-07, "loss": 0.0824, "num_tokens": 499968391.0, "reward": 1.407366156578064, "reward_std": 0.3396851718425751, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8962053656578064, "rewards/tag_count_reward/std": 0.24881619215011597, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1086.1317138671875, "completions/mean_terminated_length": 895.8155517578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16834478717170104, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13580965646497772, "kl": 0.0187225341796875, "learning_rate": 9.873880473427424e-07, "loss": 0.1234, "num_tokens": 500525442.0, "reward": 1.3465402126312256, "reward_std": 0.3806202709674835, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8934151530265808, "rewards/tag_count_reward/std": 0.2398875206708908, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1015.825927734375, "completions/mean_terminated_length": 804.9515991210938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16855788183900697, "frac_reward_zero_std": 0.0, "grad_norm": 0.13901525481969168, "kl": 0.01849365234375, "learning_rate": 9.873092259549657e-07, "loss": 0.1408, "num_tokens": 501047876.0, "reward": 1.3850446939468384, "reward_std": 0.4454170763492584, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8694196343421936, "rewards/tag_count_reward/std": 0.274212509393692, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1069.529052734375, "completions/mean_terminated_length": 860.0460815429688, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.16877097650631293, "frac_reward_zero_std": 0.0, "grad_norm": 0.11783784715946118, "kl": 0.01739501953125, "learning_rate": 9.872301625500054e-07, "loss": 0.0678, "num_tokens": 501595665.0, "reward": 1.4441964626312256, "reward_std": 0.3454887568950653, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8928571343421936, "rewards/tag_count_reward/std": 0.24682386219501495, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1114.102783203125, "completions/mean_terminated_length": 810.171630859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16898407117361888, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11269875261426066, "kl": 0.0170440673828125, "learning_rate": 9.871508571716173e-07, "loss": 0.0436, "num_tokens": 502160975.0, "reward": 1.3030134439468384, "reward_std": 0.39404064416885376, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.21482694149017334, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1000.1897583007812, "completions/mean_terminated_length": 758.3873901367188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16919716584092484, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1316779247178525, "kl": 0.020599365234375, "learning_rate": 9.870713098636912e-07, "loss": 0.0759, "num_tokens": 502678196.0, "reward": 1.4335938692092896, "reward_std": 0.36590439081192017, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2757668197154999, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1012.4085083007812, "completions/mean_terminated_length": 849.1757202148438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.16941026050823077, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1280866594894134, "kl": 0.01824951171875, "learning_rate": 9.869915206702495e-07, "loss": 0.0971, "num_tokens": 503198859.0, "reward": 1.4944196939468384, "reward_std": 0.41933438181877136, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8939732313156128, "rewards/tag_count_reward/std": 0.24274127185344696, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 938.8928833007812, "completions/mean_terminated_length": 780.448974609375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.16962335517553673, "frac_reward_zero_std": 0.0, "grad_norm": 0.13960112741214545, "kl": 0.019683837890625, "learning_rate": 9.869114896354501e-07, "loss": 0.1859, "num_tokens": 503683099.0, "reward": 1.4419643878936768, "reward_std": 0.37365370988845825, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.23264934122562408, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 954.810302734375, "completions/mean_terminated_length": 795.4450073242188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.16983644984284268, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1416180253116865, "kl": 0.0211181640625, "learning_rate": 9.868312168035841e-07, "loss": 0.0721, "num_tokens": 504180086.0, "reward": 1.5178571939468384, "reward_std": 0.3867291510105133, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.22558781504631042, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1058.665283203125, "completions/mean_terminated_length": 820.2382202148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17004954451014864, "frac_reward_zero_std": 0.0, "grad_norm": 0.12907451550579208, "kl": 0.01727294921875, "learning_rate": 9.86750702219076e-07, "loss": 0.1217, "num_tokens": 504728944.0, "reward": 1.31640625, "reward_std": 0.3998180329799652, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8699776530265808, "rewards/tag_count_reward/std": 0.2685560882091522, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 982.15185546875, "completions/mean_terminated_length": 814.14990234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17026263917745457, "frac_reward_zero_std": 0.0, "grad_norm": 0.14113066617204376, "kl": 0.01953125, "learning_rate": 9.866699459264846e-07, "loss": 0.0963, "num_tokens": 505241796.0, "reward": 1.5066964626312256, "reward_std": 0.36500248312950134, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8995535969734192, "rewards/tag_count_reward/std": 0.23521094024181366, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1133.5357666015625, "completions/mean_terminated_length": 910.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17047573384476053, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11182911996434776, "kl": 0.0133514404296875, "learning_rate": 9.865889479705027e-07, "loss": 0.1273, "num_tokens": 505825028.0, "reward": 1.3649554252624512, "reward_std": 0.44971877336502075, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24096561968326569, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1078.962158203125, "completions/mean_terminated_length": 868.3016357421875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1706888285120665, "frac_reward_zero_std": 0.0, "grad_norm": 0.13245310528949117, "kl": 0.0164947509765625, "learning_rate": 9.865077083959557e-07, "loss": 0.098, "num_tokens": 506379731.0, "reward": 1.321428656578064, "reward_std": 0.40782609581947327, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8772321343421936, "rewards/tag_count_reward/std": 0.2696329653263092, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1006.85498046875, "completions/mean_terminated_length": 807.4866943359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17090192317937244, "frac_reward_zero_std": 0.0, "grad_norm": 0.13758957582685366, "kl": 0.017608642578125, "learning_rate": 9.864262272478043e-07, "loss": 0.058, "num_tokens": 506896114.0, "reward": 1.4614956378936768, "reward_std": 0.3501630127429962, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.22752127051353455, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1085.680908203125, "completions/mean_terminated_length": 866.85205078125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.17111501784667837, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15229395075382232, "kl": 0.017822265625, "learning_rate": 9.863445045711415e-07, "loss": 0.1377, "num_tokens": 507452883.0, "reward": 1.3119419813156128, "reward_std": 0.367929607629776, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.26011165976524353, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1033.9554443359375, "completions/mean_terminated_length": 826.7849731445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17132811251398433, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12760782686145986, "kl": 0.018646240234375, "learning_rate": 9.862625404111947e-07, "loss": 0.1117, "num_tokens": 507987983.0, "reward": 1.3950893878936768, "reward_std": 0.3463027775287628, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8995535969734192, "rewards/tag_count_reward/std": 0.24281583726406097, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1096.078125, "completions/mean_terminated_length": 829.5399780273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1715412071812903, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12176998301862253, "kl": 0.017364501953125, "learning_rate": 9.861803348133248e-07, "loss": 0.091, "num_tokens": 508549026.0, "reward": 1.3844866752624512, "reward_std": 0.3844606578350067, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8643973469734192, "rewards/tag_count_reward/std": 0.2735532820224762, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1027.65185546875, "completions/mean_terminated_length": 785.2486572265625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.17175430184859625, "frac_reward_zero_std": 0.0, "grad_norm": 0.15780490669581076, "kl": 0.019256591796875, "learning_rate": 9.860978878230266e-07, "loss": 0.0596, "num_tokens": 509084406.0, "reward": 1.2957589626312256, "reward_std": 0.3426037132740021, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.24278241395950317, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1014.5335083007812, "completions/mean_terminated_length": 842.2890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17196739651590218, "frac_reward_zero_std": 0.0, "grad_norm": 0.13828243637365842, "kl": 0.01934814453125, "learning_rate": 9.860151994859277e-07, "loss": 0.0881, "num_tokens": 509607797.0, "reward": 1.426897406578064, "reward_std": 0.4110271632671356, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8889508843421936, "rewards/tag_count_reward/std": 0.24707597494125366, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 961.77685546875, "completions/mean_terminated_length": 780.7396240234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17218049118320813, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14546094836819282, "kl": 0.0201416015625, "learning_rate": 9.8593226984779e-07, "loss": 0.0453, "num_tokens": 510103521.0, "reward": 1.383928656578064, "reward_std": 0.25870847702026367, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9084821343421936, "rewards/tag_count_reward/std": 0.22309480607509613, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 910.4241333007812, "completions/mean_terminated_length": 767.5125732421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1723935858505141, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13986036976829633, "kl": 0.02178955078125, "learning_rate": 9.85849098954509e-07, "loss": 0.0504, "num_tokens": 510573743.0, "reward": 1.5876116752624512, "reward_std": 0.3486694395542145, "rewards/accuracy_reward/mean": 0.6830357313156128, "rewards/accuracy_reward/std": 0.4658135175704956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9045758843421936, "rewards/tag_count_reward/std": 0.2166612446308136, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1004.6295166015625, "completions/mean_terminated_length": 827.55615234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17260668051782005, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14373156835951706, "kl": 0.02008056640625, "learning_rate": 9.857656868521128e-07, "loss": 0.0967, "num_tokens": 511092233.0, "reward": 1.3258929252624512, "reward_std": 0.29462239146232605, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9084821343421936, "rewards/tag_count_reward/std": 0.21673686802387238, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 983.19873046875, "completions/mean_terminated_length": 808.9584350585938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.17281977518512598, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14363248711857868, "kl": 0.019317626953125, "learning_rate": 9.85682033586764e-07, "loss": 0.1254, "num_tokens": 511609842.0, "reward": 1.4547991752624512, "reward_std": 0.3784177005290985, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8922991156578064, "rewards/tag_count_reward/std": 0.24055272340774536, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1027.37060546875, "completions/mean_terminated_length": 763.6123657226562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.17303286985243194, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15304008254762147, "kl": 0.01873779296875, "learning_rate": 9.855981392047582e-07, "loss": 0.0761, "num_tokens": 512141128.0, "reward": 1.3560268878936768, "reward_std": 0.34131333231925964, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.27262356877326965, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1077.279052734375, "completions/mean_terminated_length": 885.2112426757812, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1732459645197379, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1255546197262071, "kl": 0.018829345703125, "learning_rate": 9.855140037525246e-07, "loss": 0.105, "num_tokens": 512706277.0, "reward": 1.4810268878936768, "reward_std": 0.38749974966049194, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.4921026825904846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8895089030265808, "rewards/tag_count_reward/std": 0.2564849555492401, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 938.69873046875, "completions/mean_terminated_length": 726.2792358398438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.17345905918704385, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13838736473928231, "kl": 0.01861572265625, "learning_rate": 9.854296272766258e-07, "loss": 0.0565, "num_tokens": 513191582.0, "reward": 1.4988839626312256, "reward_std": 0.3418709635734558, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.22692033648490906, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1000.1250610351562, "completions/mean_terminated_length": 786.0430297851562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17367215385434978, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12841674594571809, "kl": 0.018890380859375, "learning_rate": 9.853450098237576e-07, "loss": 0.0961, "num_tokens": 513711494.0, "reward": 1.3811384439468384, "reward_std": 0.3607409596443176, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.2191363126039505, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1043.165283203125, "completions/mean_terminated_length": 824.7228393554688, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.17388524852165574, "frac_reward_zero_std": 0.0, "grad_norm": 0.12796800266321556, "kl": 0.018310546875, "learning_rate": 9.852601514407492e-07, "loss": 0.0883, "num_tokens": 514241472.0, "reward": 1.4910714626312256, "reward_std": 0.4593278467655182, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.2511458992958069, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 817.3013916015625, "completions/mean_terminated_length": 709.7645874023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1740983431889617, "frac_reward_zero_std": 0.0, "grad_norm": 0.22187650620338542, "kl": 0.0279541015625, "learning_rate": 9.851750521745631e-07, "loss": 0.0593, "num_tokens": 514675687.0, "reward": 1.5446429252624512, "reward_std": 0.2849263548851013, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.17457373440265656, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 935.9710083007812, "completions/mean_terminated_length": 760.68994140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.17431143785626765, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1455488867155865, "kl": 0.0235595703125, "learning_rate": 9.850897120722958e-07, "loss": 0.0711, "num_tokens": 515163258.0, "reward": 1.5641741752624512, "reward_std": 0.35682252049446106, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.21243098378181458, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 983.1428833007812, "completions/mean_terminated_length": 795.884521484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1745245325235736, "frac_reward_zero_std": 0.0, "grad_norm": 0.13117668369356553, "kl": 0.018157958984375, "learning_rate": 9.85004131181176e-07, "loss": 0.102, "num_tokens": 515667386.0, "reward": 1.4726563692092896, "reward_std": 0.40249624848365784, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8989955186843872, "rewards/tag_count_reward/std": 0.24229536950588226, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1097.5335693359375, "completions/mean_terminated_length": 795.62060546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.17473762719087954, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.9049417323035682, "kl": 0.0238037109375, "learning_rate": 9.849183095485662e-07, "loss": 0.1036, "num_tokens": 516232585.0, "reward": 1.3510044813156128, "reward_std": 0.4079856276512146, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8934151530265808, "rewards/tag_count_reward/std": 0.24393343925476074, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1068.0067138671875, "completions/mean_terminated_length": 870.9571533203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1749507218581855, "frac_reward_zero_std": 0.0, "grad_norm": 0.12649692216588773, "kl": 0.017791748046875, "learning_rate": 9.848322472219625e-07, "loss": 0.0822, "num_tokens": 516787052.0, "reward": 1.3805804252624512, "reward_std": 0.36983272433280945, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.25340983271598816, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 934.1607666015625, "completions/mean_terminated_length": 824.9608154296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17516381652549146, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13396193991790273, "kl": 0.01947021484375, "learning_rate": 9.847459442489933e-07, "loss": 0.0602, "num_tokens": 517267780.0, "reward": 1.6216518878936768, "reward_std": 0.2967464029788971, "rewards/accuracy_reward/mean": 0.7165178656578064, "rewards/accuracy_reward/std": 0.4511922299861908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2235727310180664, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 981.5491333007812, "completions/mean_terminated_length": 838.4556884765625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1753769111927974, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13399583815501598, "kl": 0.019439697265625, "learning_rate": 9.846594006774207e-07, "loss": 0.1079, "num_tokens": 517771258.0, "reward": 1.4704241752624512, "reward_std": 0.3607228398323059, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8967633843421936, "rewards/tag_count_reward/std": 0.24018916487693787, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1054.446533203125, "completions/mean_terminated_length": 841.7344360351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17559000586010334, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.129341305739734, "kl": 0.017364501953125, "learning_rate": 9.845726165551406e-07, "loss": 0.119, "num_tokens": 518315250.0, "reward": 1.3973214626312256, "reward_std": 0.4071505665779114, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.26121383905410767, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1204.93310546875, "completions/mean_terminated_length": 975.0057373046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1758031005274093, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12304783085405822, "kl": 0.016998291015625, "learning_rate": 9.844855919301809e-07, "loss": 0.1033, "num_tokens": 518922836.0, "reward": 1.2678571939468384, "reward_std": 0.403646320104599, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8816964030265808, "rewards/tag_count_reward/std": 0.26747602224349976, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 996.0870971679688, "completions/mean_terminated_length": 763.9209594726562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17601619519471526, "frac_reward_zero_std": 0.0, "grad_norm": 0.15660129548662077, "kl": 0.01922607421875, "learning_rate": 9.843983268507028e-07, "loss": 0.0898, "num_tokens": 519440171.0, "reward": 1.3560268878936768, "reward_std": 0.35104814171791077, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8917410969734192, "rewards/tag_count_reward/std": 0.24290579557418823, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 934.3616333007812, "completions/mean_terminated_length": 794.457275390625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.17622928986202122, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1406009542567803, "kl": 0.01898193359375, "learning_rate": 9.843108213650013e-07, "loss": 0.0998, "num_tokens": 519924813.0, "reward": 1.3515626192092896, "reward_std": 0.31163233518600464, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.2222510427236557, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1191.16748046875, "completions/mean_terminated_length": 944.951171875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.17644238452932715, "frac_reward_zero_std": 0.0, "grad_norm": 0.11624911417911378, "kl": 0.0152435302734375, "learning_rate": 9.84223075521504e-07, "loss": 0.0562, "num_tokens": 520531000.0, "reward": 1.28125, "reward_std": 0.3583948314189911, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.24815554916858673, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1110.477783203125, "completions/mean_terminated_length": 834.0982666015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1766554791966331, "frac_reward_zero_std": 0.0, "grad_norm": 0.13957798308123592, "kl": 0.01776123046875, "learning_rate": 9.841350893687712e-07, "loss": 0.0982, "num_tokens": 521097742.0, "reward": 1.2840402126312256, "reward_std": 0.4644352197647095, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8532366156578064, "rewards/tag_count_reward/std": 0.2878517210483551, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1108.9910888671875, "completions/mean_terminated_length": 828.6492919921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17686857386393906, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3062296208053762, "kl": 0.03692626953125, "learning_rate": 9.840468629554968e-07, "loss": 0.0827, "num_tokens": 521661482.0, "reward": 1.4006696939468384, "reward_std": 0.3762128949165344, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.26315414905548096, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1021.2656860351562, "completions/mean_terminated_length": 868.5718383789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17708166853124502, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12098574812589716, "kl": 0.0179443359375, "learning_rate": 9.839583963305068e-07, "loss": 0.1055, "num_tokens": 522187905.0, "reward": 1.5496652126312256, "reward_std": 0.3815498650074005, "rewards/accuracy_reward/mean": 0.6696428656578064, "rewards/accuracy_reward/std": 0.47086748480796814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8800223469734192, "rewards/tag_count_reward/std": 0.26009246706962585, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1015.7500610351562, "completions/mean_terminated_length": 811.508056640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.17729476319855095, "frac_reward_zero_std": 0.0, "grad_norm": 0.12536385488903412, "kl": 0.0174560546875, "learning_rate": 9.838696895427614e-07, "loss": 0.0773, "num_tokens": 522710913.0, "reward": 1.4642857313156128, "reward_std": 0.3339889347553253, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.23197223246097565, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 986.8147583007812, "completions/mean_terminated_length": 796.91845703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1775078578658569, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14978774051107172, "kl": 0.019805908203125, "learning_rate": 9.837807426413526e-07, "loss": 0.1088, "num_tokens": 523225534.0, "reward": 1.5290179252624512, "reward_std": 0.3605847954750061, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.23015686869621277, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1119.1116943359375, "completions/mean_terminated_length": 892.050048828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17772095253316286, "frac_reward_zero_std": 0.0, "grad_norm": 0.13096546853962232, "kl": 0.01715087890625, "learning_rate": 9.83691555675506e-07, "loss": 0.1229, "num_tokens": 523797680.0, "reward": 1.4017857313156128, "reward_std": 0.440616637468338, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.23955488204956055, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1011.8638916015625, "completions/mean_terminated_length": 842.3142700195312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.17793404720046882, "frac_reward_zero_std": 0.0, "grad_norm": 0.1429841387124913, "kl": 0.0174560546875, "learning_rate": 9.836021286945794e-07, "loss": 0.1207, "num_tokens": 524317875.0, "reward": 1.3470982313156128, "reward_std": 0.33667925000190735, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.49717557430267334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2309555560350418, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1039.743408203125, "completions/mean_terminated_length": 849.859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17814714186777475, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1265815492536423, "kl": 0.019500732421875, "learning_rate": 9.835124617480643e-07, "loss": 0.1154, "num_tokens": 524855072.0, "reward": 1.3978794813156128, "reward_std": 0.3377029001712799, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.2326003611087799, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1016.7879638671875, "completions/mean_terminated_length": 778.8159790039062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1783602365350807, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15091653024462656, "kl": 0.01971435546875, "learning_rate": 9.834225548855838e-07, "loss": 0.1042, "num_tokens": 525373729.0, "reward": 1.4793527126312256, "reward_std": 0.3538217842578888, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8856026530265808, "rewards/tag_count_reward/std": 0.2544882893562317, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1138.060302734375, "completions/mean_terminated_length": 918.7672729492188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.17857333120238666, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11816602252606195, "kl": 0.017120361328125, "learning_rate": 9.83332408156895e-07, "loss": 0.0638, "num_tokens": 525955596.0, "reward": 1.477678656578064, "reward_std": 0.39377862215042114, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23201528191566467, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1125.7054443359375, "completions/mean_terminated_length": 853.8150024414062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17878642586969262, "frac_reward_zero_std": 0.0, "grad_norm": 0.14082054647841116, "kl": 0.01812744140625, "learning_rate": 9.832420216118871e-07, "loss": 0.1132, "num_tokens": 526534872.0, "reward": 1.2444196939468384, "reward_std": 0.41731616854667664, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.28232377767562866, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1134.384033203125, "completions/mean_terminated_length": 904.703857421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17899952053699855, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1341890070250784, "kl": 0.020111083984375, "learning_rate": 9.83151395300582e-07, "loss": 0.0891, "num_tokens": 527115316.0, "reward": 1.4285714626312256, "reward_std": 0.31156718730926514, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.231843039393425, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1100.265625, "completions/mean_terminated_length": 909.7024536132812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.1792126152043045, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11540806408311374, "kl": 0.017669677734375, "learning_rate": 9.830605292731347e-07, "loss": 0.0534, "num_tokens": 527673387.0, "reward": 1.4469866752624512, "reward_std": 0.3376753032207489, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.21483854949474335, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1135.040283203125, "completions/mean_terminated_length": 908.70751953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17942570987161047, "frac_reward_zero_std": 0.0, "grad_norm": 0.12256690693632091, "kl": 0.017242431640625, "learning_rate": 9.829694235798323e-07, "loss": 0.0964, "num_tokens": 528255453.0, "reward": 1.4592634439468384, "reward_std": 0.4008312523365021, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2595735788345337, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1093.1295166015625, "completions/mean_terminated_length": 815.1988525390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17963880453891642, "frac_reward_zero_std": 0.0, "grad_norm": 0.39184714419617994, "kl": 0.02716064453125, "learning_rate": 9.828780782710948e-07, "loss": 0.0838, "num_tokens": 528811655.0, "reward": 1.3030134439468384, "reward_std": 0.37335294485092163, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2517460286617279, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1103.96435546875, "completions/mean_terminated_length": 839.63427734375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.17985189920622235, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12563362753294519, "kl": 0.018310546875, "learning_rate": 9.827864933974753e-07, "loss": 0.0906, "num_tokens": 529381671.0, "reward": 1.28125, "reward_std": 0.3379424214363098, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8816964030265808, "rewards/tag_count_reward/std": 0.253520667552948, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1088.3013916015625, "completions/mean_terminated_length": 794.5160522460938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1800649938735283, "frac_reward_zero_std": 0.0, "grad_norm": 0.14116412359340935, "kl": 0.01715087890625, "learning_rate": 9.826946690096583e-07, "loss": 0.1054, "num_tokens": 529936574.0, "reward": 1.454241156578064, "reward_std": 0.3952125012874603, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.26002347469329834, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1049.4754638671875, "completions/mean_terminated_length": 838.9757080078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18027808854083427, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1323145174629971, "kl": 0.018157958984375, "learning_rate": 9.826026051584622e-07, "loss": 0.1141, "num_tokens": 530469683.0, "reward": 1.4135044813156128, "reward_std": 0.3943188786506653, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8577008843421936, "rewards/tag_count_reward/std": 0.2832612991333008, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1028.622802734375, "completions/mean_terminated_length": 846.2079467773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.18049118320814023, "frac_reward_zero_std": 0.0, "grad_norm": 0.1390860132543264, "kl": 0.0186767578125, "learning_rate": 9.825103018948368e-07, "loss": 0.0531, "num_tokens": 530996042.0, "reward": 1.4207589626312256, "reward_std": 0.32364344596862793, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390644073486, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.21276935935020447, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1062.493408203125, "completions/mean_terminated_length": 892.2225341796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18070427787544616, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12394130330994002, "kl": 0.01898193359375, "learning_rate": 9.824177592698654e-07, "loss": 0.0466, "num_tokens": 531539255.0, "reward": 1.4720982313156128, "reward_std": 0.3249511122703552, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.22599419951438904, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1134.888427734375, "completions/mean_terminated_length": 812.1268920898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18091737254275211, "frac_reward_zero_std": 0.0, "grad_norm": 0.1324424655918439, "kl": 0.015594482421875, "learning_rate": 9.823249773347629e-07, "loss": 0.0989, "num_tokens": 532122085.0, "reward": 1.2410714626312256, "reward_std": 0.34604308009147644, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.23792338371276855, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 988.3214721679688, "completions/mean_terminated_length": 827.5989990234375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18113046721005807, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14527627770587262, "kl": 0.019622802734375, "learning_rate": 9.822319561408772e-07, "loss": 0.0967, "num_tokens": 532632341.0, "reward": 1.512834906578064, "reward_std": 0.3208707869052887, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.18335527181625366, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 928.04248046875, "completions/mean_terminated_length": 771.3053588867188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.18134356187736403, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.17837806822699176, "kl": 0.02435302734375, "learning_rate": 9.821386957396882e-07, "loss": 0.0942, "num_tokens": 533118312.0, "reward": 1.5580357313156128, "reward_std": 0.31677085161209106, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.21902872622013092, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1080.390625, "completions/mean_terminated_length": 876.4081420898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.18155665654466996, "frac_reward_zero_std": 0.0, "grad_norm": 0.14599819769459818, "kl": 0.018035888671875, "learning_rate": 9.820451961828085e-07, "loss": 0.1153, "num_tokens": 533675303.0, "reward": 1.3989956378936768, "reward_std": 0.37759333848953247, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2138717770576477, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1081.825927734375, "completions/mean_terminated_length": 855.5867919921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18176975121197592, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14042897147515412, "kl": 0.0163726806640625, "learning_rate": 9.81951457521983e-07, "loss": 0.0989, "num_tokens": 534226873.0, "reward": 1.368303656578064, "reward_std": 0.3966476023197174, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8794642686843872, "rewards/tag_count_reward/std": 0.26171037554740906, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1140.2545166015625, "completions/mean_terminated_length": 905.6685791015625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.18198284587928187, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287666765472608, "kl": 0.0171966552734375, "learning_rate": 9.81857479809089e-07, "loss": 0.07, "num_tokens": 534814299.0, "reward": 1.3844866752624512, "reward_std": 0.38776344060897827, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9068080186843872, "rewards/tag_count_reward/std": 0.23612141609191895, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1116.2567138671875, "completions/mean_terminated_length": 848.5143432617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18219594054658783, "frac_reward_zero_std": 0.0, "grad_norm": 0.13093682328142742, "kl": 0.016754150390625, "learning_rate": 9.817632630961354e-07, "loss": 0.1137, "num_tokens": 535377678.0, "reward": 1.3872768878936768, "reward_std": 0.3309180438518524, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8805803656578064, "rewards/tag_count_reward/std": 0.2584632337093353, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1108.6317138671875, "completions/mean_terminated_length": 891.8544311523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18240903521389376, "frac_reward_zero_std": 0.0, "grad_norm": 0.12030697921826104, "kl": 0.01666259765625, "learning_rate": 9.816688074352645e-07, "loss": 0.1062, "num_tokens": 535941961.0, "reward": 1.3247768878936768, "reward_std": 0.36762282252311707, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24038465321063995, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1039.4107666015625, "completions/mean_terminated_length": 846.2765502929688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18262212988119972, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12695347725929787, "kl": 0.01788330078125, "learning_rate": 9.815741128787503e-07, "loss": 0.1243, "num_tokens": 536472001.0, "reward": 1.4157366752624512, "reward_std": 0.3780704736709595, "rewards/accuracy_reward/mean": 0.5532407164573669, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8822544813156128, "rewards/tag_count_reward/std": 0.25733718276023865, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1013.388427734375, "completions/mean_terminated_length": 818.5410766601562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18283522454850568, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.5118013329070157, "kl": 0.05303955078125, "learning_rate": 9.814791794789986e-07, "loss": 0.0213, "num_tokens": 536994319.0, "reward": 1.4045759439468384, "reward_std": 0.28441864252090454, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.2086479365825653, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1102.946533203125, "completions/mean_terminated_length": 841.7777709960938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.18304831921581163, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13643183175576157, "kl": 0.019287109375, "learning_rate": 9.81384007288548e-07, "loss": 0.0525, "num_tokens": 537554583.0, "reward": 1.469866156578064, "reward_std": 0.3209141492843628, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.21987920999526978, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 973.8013916015625, "completions/mean_terminated_length": 820.3443603515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18326141388311756, "frac_reward_zero_std": 0.0, "grad_norm": 0.1365111540687456, "kl": 0.018890380859375, "learning_rate": 9.81288596360069e-07, "loss": 0.0998, "num_tokens": 538061934.0, "reward": 1.505022406578064, "reward_std": 0.4115324914455414, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21634988486766815, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1032.868408203125, "completions/mean_terminated_length": 808.8201293945312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.18347450855042352, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.135029607527849, "kl": 0.01715087890625, "learning_rate": 9.811929467463644e-07, "loss": 0.0942, "num_tokens": 538594979.0, "reward": 1.501116156578064, "reward_std": 0.38599494099617004, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8939732313156128, "rewards/tag_count_reward/std": 0.25673794746398926, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1087.6875, "completions/mean_terminated_length": 856.2548217773438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.18368760321772948, "frac_reward_zero_std": 0.0, "grad_norm": 0.13104428429302495, "kl": 0.01885986328125, "learning_rate": 9.810970585003686e-07, "loss": 0.0744, "num_tokens": 539146199.0, "reward": 1.3476563692092896, "reward_std": 0.3571614623069763, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8900669813156128, "rewards/tag_count_reward/std": 0.24870266020298004, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1120.85498046875, "completions/mean_terminated_length": 874.663818359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18390069788503544, "frac_reward_zero_std": 0.0, "grad_norm": 0.12790363551308906, "kl": 0.01629638671875, "learning_rate": 9.810009316751487e-07, "loss": 0.1049, "num_tokens": 539712694.0, "reward": 1.3102679252624512, "reward_std": 0.40214911103248596, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.24430227279663086, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1103.1116943359375, "completions/mean_terminated_length": 888.24658203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.18411379255234137, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11861042092319073, "kl": 0.0177001953125, "learning_rate": 9.809045663239033e-07, "loss": 0.0826, "num_tokens": 540280600.0, "reward": 1.2678571939468384, "reward_std": 0.35042956471443176, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8883928656578064, "rewards/tag_count_reward/std": 0.25380611419677734, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1117.5960693359375, "completions/mean_terminated_length": 843.3150024414062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.18432688721964732, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1408078647826975, "kl": 0.01519775390625, "learning_rate": 9.808079624999634e-07, "loss": 0.0936, "num_tokens": 540852659.0, "reward": 1.3186384439468384, "reward_std": 0.37147876620292664, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.23275060951709747, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1048.404052734375, "completions/mean_terminated_length": 853.8159790039062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.18453998188695328, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1489224783461349, "kl": 0.018402099609375, "learning_rate": 9.807111202567919e-07, "loss": 0.0713, "num_tokens": 541389304.0, "reward": 1.4804688692092896, "reward_std": 0.29375186562538147, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.22353574633598328, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1069.7857666015625, "completions/mean_terminated_length": 885.5596923828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.18475307655425924, "frac_reward_zero_std": 0.0, "grad_norm": 0.12622876198319827, "kl": 0.0172882080078125, "learning_rate": 9.806140396479834e-07, "loss": 0.0742, "num_tokens": 541944744.0, "reward": 1.4202009439468384, "reward_std": 0.4015435576438904, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21197210252285004, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 994.6495971679688, "completions/mean_terminated_length": 786.232666015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18496617122156517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14702530850785664, "kl": 0.0198974609375, "learning_rate": 9.805167207272647e-07, "loss": 0.1048, "num_tokens": 542459355.0, "reward": 1.4988839626312256, "reward_std": 0.38961467146873474, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8917410969734192, "rewards/tag_count_reward/std": 0.24348074197769165, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1195.2076416015625, "completions/mean_terminated_length": 900.69970703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18517926588887113, "frac_reward_zero_std": 0.0, "grad_norm": 0.12416011425409289, "kl": 0.018829345703125, "learning_rate": 9.804191635484942e-07, "loss": 0.0916, "num_tokens": 543065592.0, "reward": 1.3364956378936768, "reward_std": 0.37175387144088745, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8989955186843872, "rewards/tag_count_reward/std": 0.23107106983661652, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1156.3773193359375, "completions/mean_terminated_length": 922.7971801757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18539236055617708, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1269630574523989, "kl": 0.0179443359375, "learning_rate": 9.803213681656627e-07, "loss": 0.0783, "num_tokens": 543658065.0, "reward": 1.3297991752624512, "reward_std": 0.35142168402671814, "rewards/accuracy_reward/mean": 0.43518519401550293, "rewards/accuracy_reward/std": 0.4963560700416565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23024296760559082, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1067.83935546875, "completions/mean_terminated_length": 857.9945678710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18560545522348304, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13005272770798165, "kl": 0.01690673828125, "learning_rate": 9.80223334632892e-07, "loss": 0.0857, "num_tokens": 544206473.0, "reward": 1.4743304252624512, "reward_std": 0.4461269676685333, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.23583374917507172, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 951.2545166015625, "completions/mean_terminated_length": 778.3824462890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18581854989078897, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14486847202469041, "kl": 0.0208740234375, "learning_rate": 9.801250630044362e-07, "loss": 0.0776, "num_tokens": 544700619.0, "reward": 1.462053656578064, "reward_std": 0.3894461393356323, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20557457208633423, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 943.1295166015625, "completions/mean_terminated_length": 788.5038452148438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18603164455809493, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12596267179736972, "kl": 0.0201416015625, "learning_rate": 9.800265533346816e-07, "loss": 0.0871, "num_tokens": 545190581.0, "reward": 1.5145089626312256, "reward_std": 0.2981497049331665, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18416117131710052, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1070.38623046875, "completions/mean_terminated_length": 867.4851684570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18624473922540088, "frac_reward_zero_std": 0.0, "grad_norm": 0.12821212235338597, "kl": 0.017913818359375, "learning_rate": 9.799278056781453e-07, "loss": 0.0893, "num_tokens": 545739970.0, "reward": 1.4218751192092896, "reward_std": 0.37049028277397156, "rewards/accuracy_reward/mean": 0.5462962985038757, "rewards/accuracy_reward/std": 0.49842923879623413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8950892686843872, "rewards/tag_count_reward/std": 0.24437379837036133, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1072.1473388671875, "completions/mean_terminated_length": 850.2410888671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18645783389270684, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1297002027899391, "kl": 0.017608642578125, "learning_rate": 9.798288200894768e-07, "loss": 0.0828, "num_tokens": 546282980.0, "reward": 1.4960938692092896, "reward_std": 0.3387187719345093, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21891970932483673, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 916.1875610351562, "completions/mean_terminated_length": 764.3240356445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18667092856001277, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12635301300063642, "kl": 0.01837158203125, "learning_rate": 9.79729596623457e-07, "loss": 0.0404, "num_tokens": 546763064.0, "reward": 1.5172991752624512, "reward_std": 0.3567131757736206, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.494759202003479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17069678008556366, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1070.44873046875, "completions/mean_terminated_length": 857.9375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.18688402322731873, "frac_reward_zero_std": 0.0, "grad_norm": 0.1408906467106944, "kl": 0.019439697265625, "learning_rate": 9.796301353349984e-07, "loss": 0.158, "num_tokens": 547308433.0, "reward": 1.2896206378936768, "reward_std": 0.41165655851364136, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8989955186843872, "rewards/tag_count_reward/std": 0.24630171060562134, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1160.93310546875, "completions/mean_terminated_length": 962.1912231445312, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1870971178946247, "frac_reward_zero_std": 0.0, "grad_norm": 0.11879042075744604, "kl": 0.016815185546875, "learning_rate": 9.795304362791454e-07, "loss": 0.0542, "num_tokens": 547902819.0, "reward": 1.4224331378936768, "reward_std": 0.40214285254478455, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8911830186843872, "rewards/tag_count_reward/std": 0.2394656240940094, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1106.8192138671875, "completions/mean_terminated_length": 836.3649291992188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.18731021256193064, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11689074486762376, "kl": 0.015960693359375, "learning_rate": 9.794304995110735e-07, "loss": 0.0869, "num_tokens": 548478498.0, "reward": 1.3314732313156128, "reward_std": 0.3834100663661957, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24327555298805237, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 940.9933471679688, "completions/mean_terminated_length": 721.9598999023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1875233072292366, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1790509872819405, "kl": 0.02001953125, "learning_rate": 9.793303250860904e-07, "loss": 0.1795, "num_tokens": 548972655.0, "reward": 1.4882813692092896, "reward_std": 0.3501301109790802, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.22040872275829315, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1010.1138916015625, "completions/mean_terminated_length": 781.0435791015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18773640189654253, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13705385111251822, "kl": 0.0194091796875, "learning_rate": 9.792299130596346e-07, "loss": 0.082, "num_tokens": 549487058.0, "reward": 1.4129464626312256, "reward_std": 0.3497004806995392, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.20586584508419037, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 967.5491333007812, "completions/mean_terminated_length": 819.4669799804688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1879494965638485, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.4321274233585489, "kl": 0.05194091796875, "learning_rate": 9.791292634872767e-07, "loss": 0.1322, "num_tokens": 549982408.0, "reward": 1.4748884439468384, "reward_std": 0.37381643056869507, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9034598469734192, "rewards/tag_count_reward/std": 0.22689488530158997, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1173.1473388671875, "completions/mean_terminated_length": 901.994140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18816259123115445, "frac_reward_zero_std": 0.0, "grad_norm": 0.1228861490247945, "kl": 0.01837158203125, "learning_rate": 9.790283764247187e-07, "loss": 0.0979, "num_tokens": 550570858.0, "reward": 1.3727679252624512, "reward_std": 0.39497649669647217, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8549107313156128, "rewards/tag_count_reward/std": 0.28797370195388794, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1064.638427734375, "completions/mean_terminated_length": 894.2099609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1883756858984604, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14522509556413055, "kl": 0.022125244140625, "learning_rate": 9.789272519277936e-07, "loss": 0.0957, "num_tokens": 551118744.0, "reward": 1.4983259439468384, "reward_std": 0.3826301693916321, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9045758843421936, "rewards/tag_count_reward/std": 0.23343613743782043, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1117.4710693359375, "completions/mean_terminated_length": 832.6151733398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18858878056576633, "frac_reward_zero_std": 0.0, "grad_norm": 0.1268958739566984, "kl": 0.0168304443359375, "learning_rate": 9.78825890052466e-07, "loss": 0.1229, "num_tokens": 551695339.0, "reward": 1.3353794813156128, "reward_std": 0.41855767369270325, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8666294813156128, "rewards/tag_count_reward/std": 0.27157923579216003, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1146.274658203125, "completions/mean_terminated_length": 873.6598510742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1888018752330723, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1197987332931469, "kl": 0.016204833984375, "learning_rate": 9.787242908548323e-07, "loss": 0.0902, "num_tokens": 552279366.0, "reward": 1.4246652126312256, "reward_std": 0.33839982748031616, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8755580186843872, "rewards/tag_count_reward/std": 0.26546087861061096, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1028.9285888671875, "completions/mean_terminated_length": 843.3984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18901496990037825, "frac_reward_zero_std": 0.0, "grad_norm": 0.14030611238550697, "kl": 0.018218994140625, "learning_rate": 9.786224543911195e-07, "loss": 0.1151, "num_tokens": 552812454.0, "reward": 1.5150669813156128, "reward_std": 0.40816301107406616, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8878348469734192, "rewards/tag_count_reward/std": 0.2384992092847824, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1050.1741943359375, "completions/mean_terminated_length": 816.5234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1892280645676842, "frac_reward_zero_std": 0.0, "grad_norm": 0.12866111395369745, "kl": 0.01824951171875, "learning_rate": 9.785203807176864e-07, "loss": 0.1062, "num_tokens": 553347540.0, "reward": 1.4676339626312256, "reward_std": 0.3957492411136627, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9029017686843872, "rewards/tag_count_reward/std": 0.24360375106334686, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1006.6116333007812, "completions/mean_terminated_length": 759.2099609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.18944115923499014, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11797656626089308, "kl": 0.017852783203125, "learning_rate": 9.78418069891023e-07, "loss": 0.0714, "num_tokens": 553862438.0, "reward": 1.407366156578064, "reward_std": 0.33798545598983765, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.22366204857826233, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1233.5535888671875, "completions/mean_terminated_length": 977.994140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1896542539022961, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11769349785186133, "kl": 0.0158538818359375, "learning_rate": 9.783155219677505e-07, "loss": 0.1184, "num_tokens": 554486814.0, "reward": 1.3850446939468384, "reward_std": 0.4460518956184387, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8582589030265808, "rewards/tag_count_reward/std": 0.29347658157348633, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 880.93310546875, "completions/mean_terminated_length": 703.9228515625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.18986734856960205, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15428443063737868, "kl": 0.0216064453125, "learning_rate": 9.782127370046216e-07, "loss": 0.1058, "num_tokens": 554943648.0, "reward": 1.5552456378936768, "reward_std": 0.3522113561630249, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.47195106744766235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9123883843421936, "rewards/tag_count_reward/std": 0.2286706119775772, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 975.3013916015625, "completions/mean_terminated_length": 803.0025634765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.190080443236908, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.712524232452467, "kl": 0.04974365234375, "learning_rate": 9.781097150585194e-07, "loss": 0.0628, "num_tokens": 555459751.0, "reward": 1.4107143878936768, "reward_std": 0.36250707507133484, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.21494385600090027, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 919.9598388671875, "completions/mean_terminated_length": 748.868896484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19029353790421394, "frac_reward_zero_std": 0.0, "grad_norm": 0.15002837168625757, "kl": 0.01947021484375, "learning_rate": 9.780064561864592e-07, "loss": 0.1241, "num_tokens": 555938453.0, "reward": 1.4559152126312256, "reward_std": 0.33153530955314636, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.182987242937088, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 914.30810546875, "completions/mean_terminated_length": 732.21240234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1905066325715199, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13213480531095445, "kl": 0.0205078125, "learning_rate": 9.779029604455863e-07, "loss": 0.0602, "num_tokens": 556418751.0, "reward": 1.4469866752624512, "reward_std": 0.30496519804000854, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995505809783936, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.19061920046806335, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 960.4910888671875, "completions/mean_terminated_length": 811.4415893554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19071972723882585, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13350655927455168, "kl": 0.021453857421875, "learning_rate": 9.777992278931783e-07, "loss": 0.0596, "num_tokens": 556918571.0, "reward": 1.4715402126312256, "reward_std": 0.31978631019592285, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.18212558329105377, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1126.65185546875, "completions/mean_terminated_length": 888.5505981445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1909328219061318, "frac_reward_zero_std": 0.0, "grad_norm": 0.15110988658421268, "kl": 0.01708984375, "learning_rate": 9.776952585866427e-07, "loss": 0.1496, "num_tokens": 557491199.0, "reward": 1.3325893878936768, "reward_std": 0.38315898180007935, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8883928656578064, "rewards/tag_count_reward/std": 0.24879863858222961, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 997.513427734375, "completions/mean_terminated_length": 786.28955078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.19114591657343774, "frac_reward_zero_std": 0.0, "grad_norm": 0.13878204615663806, "kl": 0.022125244140625, "learning_rate": 9.775910525835188e-07, "loss": 0.0717, "num_tokens": 558004741.0, "reward": 1.4888393878936768, "reward_std": 0.33801183104515076, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.23720870912075043, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1021.3504638671875, "completions/mean_terminated_length": 850.2421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1913590112407437, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 218.12230754374687, "kl": 8.798553466796875, "learning_rate": 9.774866099414765e-07, "loss": 0.4774, "num_tokens": 558534578.0, "reward": 1.559709906578064, "reward_std": 0.3949979543685913, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.23508284986019135, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 925.4063110351562, "completions/mean_terminated_length": 731.4502563476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19157210590804966, "frac_reward_zero_std": 0.0, "grad_norm": 0.15162419257587623, "kl": 0.01983642578125, "learning_rate": 9.773819307183168e-07, "loss": 0.0825, "num_tokens": 559017704.0, "reward": 1.4704241752624512, "reward_std": 0.3757508397102356, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.2062104493379593, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 983.263427734375, "completions/mean_terminated_length": 792.7316284179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1917852005753556, "frac_reward_zero_std": 0.0, "grad_norm": 1.070366262144076, "kl": 0.03057861328125, "learning_rate": 9.77277014971972e-07, "loss": 0.0748, "num_tokens": 559532430.0, "reward": 1.481584906578064, "reward_std": 0.36257320642471313, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19802019000053406, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 985.65185546875, "completions/mean_terminated_length": 827.6615600585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19199829524266154, "frac_reward_zero_std": 0.0, "grad_norm": 0.1426730344604902, "kl": 0.019012451171875, "learning_rate": 9.771718627605047e-07, "loss": 0.0807, "num_tokens": 560046370.0, "reward": 1.3577009439468384, "reward_std": 0.4410659372806549, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.22125105559825897, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1115.6138916015625, "completions/mean_terminated_length": 887.697265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1922113899099675, "frac_reward_zero_std": 0.0, "grad_norm": 0.13223184006332392, "kl": 0.018707275390625, "learning_rate": 9.770664741421085e-07, "loss": 0.0663, "num_tokens": 560612837.0, "reward": 1.2935268878936768, "reward_std": 0.39318031072616577, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8917410969734192, "rewards/tag_count_reward/std": 0.24405431747436523, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1065.84375, "completions/mean_terminated_length": 868.3592529296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19242448457727346, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13569956211093284, "kl": 0.01776123046875, "learning_rate": 9.769608491751079e-07, "loss": 0.0955, "num_tokens": 561161311.0, "reward": 1.5407366752624512, "reward_std": 0.31699052453041077, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22521603107452393, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 949.5692138671875, "completions/mean_terminated_length": 782.9691772460938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.19263757924457942, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12752665561561735, "kl": 0.01953125, "learning_rate": 9.768549879179584e-07, "loss": 0.0799, "num_tokens": 561655502.0, "reward": 1.4860491752624512, "reward_std": 0.2688225209712982, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.21122872829437256, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 981.9933471679688, "completions/mean_terminated_length": 820.31103515625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19285067391188535, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12397150682089524, "kl": 0.019775390625, "learning_rate": 9.76748890429246e-07, "loss": 0.0794, "num_tokens": 562163355.0, "reward": 1.4168527126312256, "reward_std": 0.3452180325984955, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.19683773815631866, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1036.243408203125, "completions/mean_terminated_length": 852.0448608398438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1930637685791913, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12215324487839498, "kl": 0.02081298828125, "learning_rate": 9.766425567676879e-07, "loss": 0.0719, "num_tokens": 562688680.0, "reward": 1.520647406578064, "reward_std": 0.3321880102157593, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19764786958694458, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1062.8125, "completions/mean_terminated_length": 818.5737915039062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.19327686324649726, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12175350614508576, "kl": 0.020294189453125, "learning_rate": 9.76535986992131e-07, "loss": 0.0771, "num_tokens": 563235780.0, "reward": 1.4196429252624512, "reward_std": 0.34208735823631287, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.2173006236553192, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 990.7678833007812, "completions/mean_terminated_length": 848.911376953125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.19348995791380322, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12556871762295957, "kl": 0.0185546875, "learning_rate": 9.76429181161554e-07, "loss": 0.0728, "num_tokens": 563744556.0, "reward": 1.4676339626312256, "reward_std": 0.3798133134841919, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.21333187818527222, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1109.10498046875, "completions/mean_terminated_length": 941.0921630859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19370305258110915, "frac_reward_zero_std": 0.0, "grad_norm": 0.11917942891502653, "kl": 0.016265869140625, "learning_rate": 9.763221393350655e-07, "loss": 0.072, "num_tokens": 564316923.0, "reward": 1.2885044813156128, "reward_std": 0.38661423325538635, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.22735659778118134, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 950.7120971679688, "completions/mean_terminated_length": 754.3552856445312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1939161472484151, "frac_reward_zero_std": 0.0, "grad_norm": 0.14304975621724847, "kl": 0.019500732421875, "learning_rate": 9.76214861571905e-07, "loss": 0.1274, "num_tokens": 564820714.0, "reward": 1.4609376192092896, "reward_std": 0.3816933035850525, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.21748150885105133, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 982.1250610351562, "completions/mean_terminated_length": 778.021240234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19412924191572106, "frac_reward_zero_std": 0.0, "grad_norm": 0.14330406809622806, "kl": 0.018280029296875, "learning_rate": 9.761073479314429e-07, "loss": 0.1053, "num_tokens": 565329522.0, "reward": 1.4648438692092896, "reward_std": 0.4017872214317322, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9001116156578064, "rewards/tag_count_reward/std": 0.24218197166919708, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 936.9129638671875, "completions/mean_terminated_length": 771.6743774414062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19434233658302702, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13802183683128744, "kl": 0.019317626953125, "learning_rate": 9.759995984731792e-07, "loss": 0.1062, "num_tokens": 565816235.0, "reward": 1.4246652126312256, "reward_std": 0.34561508893966675, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.2086479365825653, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1087.4754638671875, "completions/mean_terminated_length": 865.8159790039062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.19455543125033295, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12723289260065085, "kl": 0.018402099609375, "learning_rate": 9.758916132567452e-07, "loss": 0.0986, "num_tokens": 566381024.0, "reward": 1.3805804252624512, "reward_std": 0.39436572790145874, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8783482313156128, "rewards/tag_count_reward/std": 0.2643847167491913, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1082.4710693359375, "completions/mean_terminated_length": 859.6566162109375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1947685259176389, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10964048619791143, "kl": 0.0162811279296875, "learning_rate": 9.757833923419027e-07, "loss": 0.0478, "num_tokens": 566941059.0, "reward": 1.3856027126312256, "reward_std": 0.38387662172317505, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19082863628864288, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1084.04248046875, "completions/mean_terminated_length": 864.8411254882812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19498162058494486, "frac_reward_zero_std": 0.0, "grad_norm": 0.23455995557445197, "kl": 0.023468017578125, "learning_rate": 9.756749357885433e-07, "loss": 0.0575, "num_tokens": 567491926.0, "reward": 1.3934152126312256, "reward_std": 0.3877316117286682, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8844866156578064, "rewards/tag_count_reward/std": 0.25672033429145813, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 967.185302734375, "completions/mean_terminated_length": 803.257080078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.19519471525225082, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11767722105789222, "kl": 0.01898193359375, "learning_rate": 9.755662436566897e-07, "loss": 0.0508, "num_tokens": 567994185.0, "reward": 1.4045759439468384, "reward_std": 0.3792615830898285, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20777854323387146, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 944.1295166015625, "completions/mean_terminated_length": 776.704345703125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19540780991955675, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14836465821152875, "kl": 0.019439697265625, "learning_rate": 9.754573160064944e-07, "loss": 0.0885, "num_tokens": 568483139.0, "reward": 1.5139509439468384, "reward_std": 0.35800066590309143, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.20454494655132294, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 924.7879638671875, "completions/mean_terminated_length": 774.0784912109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1956209045868627, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13832063026854555, "kl": 0.021636962890625, "learning_rate": 9.753481528982407e-07, "loss": 0.0423, "num_tokens": 568970276.0, "reward": 1.4263393878936768, "reward_std": 0.3221184313297272, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8950892686843872, "rewards/tag_count_reward/std": 0.2296247035264969, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 964.40185546875, "completions/mean_terminated_length": 773.8477783203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19583399925416867, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13684636074427955, "kl": 0.020751953125, "learning_rate": 9.75238754392342e-07, "loss": 0.0959, "num_tokens": 569467960.0, "reward": 1.485491156578064, "reward_std": 0.3128793239593506, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18356364965438843, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1100.875, "completions/mean_terminated_length": 907.3763427734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19604709392147462, "frac_reward_zero_std": 0.0, "grad_norm": 0.12732427143682615, "kl": 0.0170135498046875, "learning_rate": 9.751291205493421e-07, "loss": 0.1193, "num_tokens": 570034688.0, "reward": 1.2712054252624512, "reward_std": 0.4475001096725464, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8805803656578064, "rewards/tag_count_reward/std": 0.24965769052505493, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 970.2723388671875, "completions/mean_terminated_length": 806.8123168945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19626018858878055, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1363261609007378, "kl": 0.019195556640625, "learning_rate": 9.750192514299148e-07, "loss": 0.1066, "num_tokens": 570543770.0, "reward": 1.4179688692092896, "reward_std": 0.32332125306129456, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.2093229591846466, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 978.7031860351562, "completions/mean_terminated_length": 803.7272338867188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.1964732832560865, "frac_reward_zero_std": 0.0, "grad_norm": 0.6075809438990477, "kl": 0.02862548828125, "learning_rate": 9.749091470948643e-07, "loss": 0.0856, "num_tokens": 571047877.0, "reward": 1.4051339626312256, "reward_std": 0.41178596019744873, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2339629977941513, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1040.415283203125, "completions/mean_terminated_length": 850.6578369140625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19668637792339247, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13129730402658027, "kl": 0.020111083984375, "learning_rate": 9.74798807605125e-07, "loss": 0.1109, "num_tokens": 571580575.0, "reward": 1.419084906578064, "reward_std": 0.3938526213169098, "rewards/accuracy_reward/mean": 0.5462962985038757, "rewards/accuracy_reward/std": 0.4984292685985565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8922991156578064, "rewards/tag_count_reward/std": 0.24171243607997894, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1082.2210693359375, "completions/mean_terminated_length": 808.2608032226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19689947259069843, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11678656002404714, "kl": 0.0174560546875, "learning_rate": 9.74688233021761e-07, "loss": 0.0938, "num_tokens": 572135458.0, "reward": 1.3705357313156128, "reward_std": 0.32974135875701904, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8950892686843872, "rewards/tag_count_reward/std": 0.2460842877626419, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 945.3906860351562, "completions/mean_terminated_length": 737.7373657226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19711256725800436, "frac_reward_zero_std": 0.0, "grad_norm": 0.1459764898162939, "kl": 0.02191162109375, "learning_rate": 9.745774234059673e-07, "loss": 0.0963, "num_tokens": 572631121.0, "reward": 1.360491156578064, "reward_std": 0.3288823068141937, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2594851851463318, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1054.560302734375, "completions/mean_terminated_length": 815.14404296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19732566192531031, "frac_reward_zero_std": 0.0, "grad_norm": 0.14028015448548256, "kl": 0.017425537109375, "learning_rate": 9.744663788190685e-07, "loss": 0.0873, "num_tokens": 573169756.0, "reward": 1.3805804252624512, "reward_std": 0.3514879047870636, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8917410969734192, "rewards/tag_count_reward/std": 0.24462655186653137, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1018.1317138671875, "completions/mean_terminated_length": 814.3609619140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.19753875659261627, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.6730480984492418, "kl": 0.026824951171875, "learning_rate": 9.743550993225188e-07, "loss": 0.1333, "num_tokens": 573691767.0, "reward": 1.399553656578064, "reward_std": 0.3845144212245941, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.24265125393867493, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 994.9285888671875, "completions/mean_terminated_length": 786.5668334960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19775185125992223, "frac_reward_zero_std": 0.0, "grad_norm": 2.4178950857908417, "kl": 0.136962890625, "learning_rate": 9.742435849779036e-07, "loss": 0.0643, "num_tokens": 574219575.0, "reward": 1.356584906578064, "reward_std": 0.27073487639427185, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8922991156578064, "rewards/tag_count_reward/std": 0.22180332243442535, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1133.930908203125, "completions/mean_terminated_length": 825.6029663085938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.19796494592722816, "frac_reward_zero_std": 0.0, "grad_norm": 0.13873784544169512, "kl": 0.018157958984375, "learning_rate": 9.741318358469371e-07, "loss": 0.1329, "num_tokens": 574802744.0, "reward": 1.2650669813156128, "reward_std": 0.3683628439903259, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8677455186843872, "rewards/tag_count_reward/std": 0.26798370480537415, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1056.1875, "completions/mean_terminated_length": 881.7742919921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.19817804059453412, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12246915877249327, "kl": 0.01800537109375, "learning_rate": 9.740198519914637e-07, "loss": 0.0594, "num_tokens": 575345868.0, "reward": 1.4626116752624512, "reward_std": 0.35262539982795715, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.19353099167346954, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1155.97998046875, "completions/mean_terminated_length": 906.2142944335938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.19839113526184007, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12915501802536825, "kl": 0.01666259765625, "learning_rate": 9.739076334734585e-07, "loss": 0.0976, "num_tokens": 575937891.0, "reward": 1.2767857313156128, "reward_std": 0.4358396828174591, "rewards/accuracy_reward/mean": 0.4447115361690521, "rewards/accuracy_reward/std": 0.4975321590900421, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8638392686843872, "rewards/tag_count_reward/std": 0.28406286239624023, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1091.430908203125, "completions/mean_terminated_length": 899.0911865234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.19860422992914603, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12984489417640183, "kl": 0.019134521484375, "learning_rate": 9.737951803550256e-07, "loss": 0.0688, "num_tokens": 576489092.0, "reward": 1.3236607313156128, "reward_std": 0.42935460805892944, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.26121383905410767, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 954.6406860351562, "completions/mean_terminated_length": 755.5857543945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19881732459645196, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.4650473472878477, "kl": 0.04248046875, "learning_rate": 9.73682492698399e-07, "loss": 0.0613, "num_tokens": 576989411.0, "reward": 1.380022406578064, "reward_std": 0.34238237142562866, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.24317994713783264, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 953.0826416015625, "completions/mean_terminated_length": 799.849853515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19903041926375792, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13306694495655733, "kl": 0.019744873046875, "learning_rate": 9.735695705659428e-07, "loss": 0.0816, "num_tokens": 577484504.0, "reward": 1.4916294813156128, "reward_std": 0.3948051631450653, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.21563898026943207, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1036.950927734375, "completions/mean_terminated_length": 840.13330078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19924351393106388, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12593073542131225, "kl": 0.018402099609375, "learning_rate": 9.734564140201506e-07, "loss": 0.0807, "num_tokens": 578014226.0, "reward": 1.430803656578064, "reward_std": 0.4254690706729889, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.2155933827161789, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 966.7813110351562, "completions/mean_terminated_length": 745.8870849609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.19945660859836983, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1410035352138314, "kl": 0.017333984375, "learning_rate": 9.733430231236462e-07, "loss": 0.1123, "num_tokens": 578517568.0, "reward": 1.3939732313156128, "reward_std": 0.3960471451282501, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.26002347469329834, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1029.1607666015625, "completions/mean_terminated_length": 853.1309204101562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.19966970326567576, "frac_reward_zero_std": 0.0, "grad_norm": 0.12785298909646056, "kl": 0.017852783203125, "learning_rate": 9.732293979391826e-07, "loss": 0.0969, "num_tokens": 579045784.0, "reward": 1.4536831378936768, "reward_std": 0.4685271382331848, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8733258843421936, "rewards/tag_count_reward/std": 0.2606726288795471, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1041.024658203125, "completions/mean_terminated_length": 794.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.19988279793298172, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.3709747368676183, "kl": 0.03118896484375, "learning_rate": 9.731155385296428e-07, "loss": 0.0539, "num_tokens": 579585123.0, "reward": 1.3705357313156128, "reward_std": 0.4214267432689667, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.27125784754753113, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1168.3304443359375, "completions/mean_terminated_length": 875.107177734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20009589260028768, "frac_reward_zero_std": 0.0, "grad_norm": 0.11865719025963911, "kl": 0.016998291015625, "learning_rate": 9.730014449580391e-07, "loss": 0.0753, "num_tokens": 580181543.0, "reward": 1.2739956378936768, "reward_std": 0.47678273916244507, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8565848469734192, "rewards/tag_count_reward/std": 0.27262529730796814, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 974.0826416015625, "completions/mean_terminated_length": 754.6801147460938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.20030898726759364, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12893348555157633, "kl": 0.018341064453125, "learning_rate": 9.728871172875137e-07, "loss": 0.0516, "num_tokens": 580685884.0, "reward": 1.376116156578064, "reward_std": 0.35923895239830017, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.2523038983345032, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1069.0023193359375, "completions/mean_terminated_length": 829.6917114257812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.20052208193489957, "frac_reward_zero_std": 0.0, "grad_norm": 0.1423227322672156, "kl": 0.020416259765625, "learning_rate": 9.727725555813383e-07, "loss": 0.085, "num_tokens": 581233565.0, "reward": 1.30859375, "reward_std": 0.4114075303077698, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8777901530265808, "rewards/tag_count_reward/std": 0.2643912136554718, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1026.638427734375, "completions/mean_terminated_length": 790.9395751953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20073517660220552, "frac_reward_zero_std": 0.0, "grad_norm": 0.15731581995196475, "kl": 0.018951416015625, "learning_rate": 9.726577599029134e-07, "loss": 0.1021, "num_tokens": 581766891.0, "reward": 1.4207589626312256, "reward_std": 0.43133798241615295, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.251193106174469, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1126.0023193359375, "completions/mean_terminated_length": 836.6950073242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20094827126951148, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12358701980067863, "kl": 0.016143798828125, "learning_rate": 9.725427303157703e-07, "loss": 0.0376, "num_tokens": 582337580.0, "reward": 1.3141741752624512, "reward_std": 0.4282354414463043, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8699776530265808, "rewards/tag_count_reward/std": 0.26223400235176086, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1195.794677734375, "completions/mean_terminated_length": 904.9221801757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20116136593681744, "frac_reward_zero_std": 0.0, "grad_norm": 0.11546037429775202, "kl": 0.014984130859375, "learning_rate": 9.72427466883569e-07, "loss": 0.0923, "num_tokens": 582949344.0, "reward": 1.2287946939468384, "reward_std": 0.4299945831298828, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.295629620552063, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1017.6406860351562, "completions/mean_terminated_length": 793.6494750976562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2013744606041234, "frac_reward_zero_std": 0.0, "grad_norm": 0.12424074605822494, "kl": 0.01788330078125, "learning_rate": 9.723119696700987e-07, "loss": 0.0703, "num_tokens": 583470175.0, "reward": 1.4419643878936768, "reward_std": 0.3329524099826813, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8683035969734192, "rewards/tag_count_reward/std": 0.26590317487716675, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1114.732177734375, "completions/mean_terminated_length": 829.0379028320312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.20158755527142933, "frac_reward_zero_std": 0.0, "grad_norm": 0.13189317693351543, "kl": 0.017059326171875, "learning_rate": 9.721962387392784e-07, "loss": 0.1039, "num_tokens": 584038135.0, "reward": 1.2779018878936768, "reward_std": 0.3820245862007141, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8448660969734192, "rewards/tag_count_reward/std": 0.28168630599975586, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1045.0023193359375, "completions/mean_terminated_length": 830.268310546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.20180064993873528, "frac_reward_zero_std": 0.0, "grad_norm": 10.60896568534435, "kl": 0.195281982421875, "learning_rate": 9.720802741551565e-07, "loss": 0.1206, "num_tokens": 584572952.0, "reward": 1.5000001192092896, "reward_std": 0.4759651720523834, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8660714030265808, "rewards/tag_count_reward/std": 0.26159584522247314, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1046.2879638671875, "completions/mean_terminated_length": 808.3121948242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20201374460604124, "frac_reward_zero_std": 0.0, "grad_norm": 0.1278418720203623, "kl": 0.0176239013671875, "learning_rate": 9.7196407598191e-07, "loss": 0.1014, "num_tokens": 585113065.0, "reward": 1.4313616752624512, "reward_std": 0.4454249441623688, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8599330186843872, "rewards/tag_count_reward/std": 0.2794141173362732, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1114.1160888671875, "completions/mean_terminated_length": 776.3282470703125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2022268392733472, "frac_reward_zero_std": 0.0, "grad_norm": 0.12959005263771578, "kl": 0.01763916015625, "learning_rate": 9.718476442838464e-07, "loss": 0.0878, "num_tokens": 585682429.0, "reward": 1.3325893878936768, "reward_std": 0.3996261954307556, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8482142686843872, "rewards/tag_count_reward/std": 0.2835085690021515, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1121.384033203125, "completions/mean_terminated_length": 878.6365966796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20243993394065313, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12983375769191466, "kl": 0.0159912109375, "learning_rate": 9.71730979125401e-07, "loss": 0.0948, "num_tokens": 586257257.0, "reward": 1.258928656578064, "reward_std": 0.4123886823654175, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8348214030265808, "rewards/tag_count_reward/std": 0.29877373576164246, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1118.1160888671875, "completions/mean_terminated_length": 897.2044677734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20265302860795908, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12899402720112513, "kl": 0.02099609375, "learning_rate": 9.7161408057114e-07, "loss": 0.0834, "num_tokens": 586826573.0, "reward": 1.426897406578064, "reward_std": 0.406277596950531, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8643973469734192, "rewards/tag_count_reward/std": 0.26472151279449463, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 990.9308471679688, "completions/mean_terminated_length": 754.10107421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20286612327526504, "frac_reward_zero_std": 0.0, "grad_norm": 0.13524494729078168, "kl": 0.02020263671875, "learning_rate": 9.714969486857567e-07, "loss": 0.0669, "num_tokens": 587334478.0, "reward": 1.4626116752624512, "reward_std": 0.41052141785621643, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2727077007293701, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1147.2701416015625, "completions/mean_terminated_length": 951.459228515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.203079217942571, "frac_reward_zero_std": 0.0, "grad_norm": 0.11986755276536806, "kl": 0.0160369873046875, "learning_rate": 9.713795835340753e-07, "loss": 0.0699, "num_tokens": 587922439.0, "reward": 1.4765626192092896, "reward_std": 0.44435131549835205, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8627232313156128, "rewards/tag_count_reward/std": 0.27348312735557556, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1094.28125, "completions/mean_terminated_length": 861.1500244140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20329231260987693, "frac_reward_zero_std": 0.0, "grad_norm": 0.11733210974900807, "kl": 0.0179595947265625, "learning_rate": 9.712619851810482e-07, "loss": 0.0794, "num_tokens": 588480805.0, "reward": 1.4402902126312256, "reward_std": 0.38445407152175903, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.2626193165779114, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1032.18310546875, "completions/mean_terminated_length": 840.8753051757812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2035054072771829, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14104131828314123, "kl": 0.0194091796875, "learning_rate": 9.711441536917573e-07, "loss": 0.049, "num_tokens": 589009511.0, "reward": 1.422991156578064, "reward_std": 0.4051731824874878, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8761160969734192, "rewards/tag_count_reward/std": 0.2525017559528351, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1006.1094360351562, "completions/mean_terminated_length": 758.5884399414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.20371850194448884, "frac_reward_zero_std": 0.0, "grad_norm": 0.14643915796263005, "kl": 0.019134521484375, "learning_rate": 9.710260891314131e-07, "loss": 0.1108, "num_tokens": 589528568.0, "reward": 1.4626116752624512, "reward_std": 0.39833319187164307, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8889508843421936, "rewards/tag_count_reward/std": 0.2425065040588379, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1052.4910888671875, "completions/mean_terminated_length": 849.1075439453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2039315966117948, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13649285890991408, "kl": 0.0186767578125, "learning_rate": 9.709077915653552e-07, "loss": 0.1267, "num_tokens": 590075956.0, "reward": 1.4419643878936768, "reward_std": 0.37415704131126404, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8638392686843872, "rewards/tag_count_reward/std": 0.2709631323814392, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1075.071533203125, "completions/mean_terminated_length": 816.72314453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20414469127910073, "frac_reward_zero_std": 0.0, "grad_norm": 0.13186539270898398, "kl": 0.017486572265625, "learning_rate": 9.707892610590526e-07, "loss": 0.132, "num_tokens": 590623316.0, "reward": 1.3844866752624512, "reward_std": 0.4485476613044739, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8777901530265808, "rewards/tag_count_reward/std": 0.2574244737625122, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1113.32373046875, "completions/mean_terminated_length": 878.34912109375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2043577859464067, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10905023303057652, "kl": 0.0164947509765625, "learning_rate": 9.70670497678103e-07, "loss": -0.0068, "num_tokens": 591186837.0, "reward": 1.3973214626312256, "reward_std": 0.30831965804100037, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.21637944877147675, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 982.950927734375, "completions/mean_terminated_length": 765.3602294921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20457088061371265, "frac_reward_zero_std": 0.0, "grad_norm": 0.1307998086008599, "kl": 0.01953125, "learning_rate": 9.70551501488232e-07, "loss": 0.1472, "num_tokens": 591694015.0, "reward": 1.4631696939468384, "reward_std": 0.3534601926803589, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.2444223314523697, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1060.77685546875, "completions/mean_terminated_length": 819.45556640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2047839752810186, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13679705701679346, "kl": 0.0171051025390625, "learning_rate": 9.704322725552956e-07, "loss": 0.1244, "num_tokens": 592242475.0, "reward": 1.3303571939468384, "reward_std": 0.34770551323890686, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8883928656578064, "rewards/tag_count_reward/std": 0.24710693955421448, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 971.0826416015625, "completions/mean_terminated_length": 747.5714111328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20499706994832453, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.2458819804723144, "kl": 0.026763916015625, "learning_rate": 9.703128109452775e-07, "loss": 0.125, "num_tokens": 592753440.0, "reward": 1.4520089626312256, "reward_std": 0.326716810464859, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8783482313156128, "rewards/tag_count_reward/std": 0.2590422034263611, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 981.7076416015625, "completions/mean_terminated_length": 780.8938598632812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2052101646156305, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.670081992700361, "kl": 0.018829345703125, "learning_rate": 9.70193116724291e-07, "loss": 0.0791, "num_tokens": 593264813.0, "reward": 1.422991156578064, "reward_std": 0.3601451814174652, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8783482313156128, "rewards/tag_count_reward/std": 0.25468748807907104, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1033.321533203125, "completions/mean_terminated_length": 809.373291015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20542325928293645, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.135578193296156, "kl": 0.018798828125, "learning_rate": 9.700731899585773e-07, "loss": 0.083, "num_tokens": 593799933.0, "reward": 1.387834906578064, "reward_std": 0.3177962899208069, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.1878284513950348, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 988.8683471679688, "completions/mean_terminated_length": 818.7486572265625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2056363539502424, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13332120652966487, "kl": 0.018829345703125, "learning_rate": 9.699530307145067e-07, "loss": 0.0661, "num_tokens": 594313154.0, "reward": 1.4135044813156128, "reward_std": 0.36688777804374695, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.21394765377044678, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1111.7523193359375, "completions/mean_terminated_length": 859.7875366210938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.20584944861754834, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12360263314184435, "kl": 0.016204833984375, "learning_rate": 9.698326390585784e-07, "loss": 0.1081, "num_tokens": 594880867.0, "reward": 1.3532366752624512, "reward_std": 0.3957882225513458, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.24432718753814697, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 977.3594360351562, "completions/mean_terminated_length": 795.657958984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2060625432848543, "frac_reward_zero_std": 0.0, "grad_norm": 1.9560655732420187, "kl": 0.020111083984375, "learning_rate": 9.697120150574198e-07, "loss": 0.1219, "num_tokens": 595383156.0, "reward": 1.4056919813156128, "reward_std": 0.38246452808380127, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9034598469734192, "rewards/tag_count_reward/std": 0.2231668382883072, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1062.513427734375, "completions/mean_terminated_length": 848.2771606445312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.20627563795216025, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12946985131841315, "kl": 0.02044677734375, "learning_rate": 9.695911587777873e-07, "loss": 0.0726, "num_tokens": 595926058.0, "reward": 1.469866156578064, "reward_std": 0.43958985805511475, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.25340983271598816, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1058.5535888671875, "completions/mean_terminated_length": 820.0996704101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2064887326194662, "frac_reward_zero_std": 0.0, "grad_norm": 0.13937668542452109, "kl": 0.019561767578125, "learning_rate": 9.69470070286565e-07, "loss": 0.0981, "num_tokens": 596477010.0, "reward": 1.4023438692092896, "reward_std": 0.3652377426624298, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8978794813156128, "rewards/tag_count_reward/std": 0.2424035370349884, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1063.203125, "completions/mean_terminated_length": 852.3658447265625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.20670182728677214, "frac_reward_zero_std": 0.0, "grad_norm": 0.17332028545550354, "kl": 0.017547607421875, "learning_rate": 9.693487496507668e-07, "loss": 0.1219, "num_tokens": 597021725.0, "reward": 1.391741156578064, "reward_std": 0.37701690196990967, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.23568548262119293, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1080.24560546875, "completions/mean_terminated_length": 819.8016967773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2069149219540781, "frac_reward_zero_std": 0.0, "grad_norm": 0.1269790332033462, "kl": 0.017303466796875, "learning_rate": 9.692271969375341e-07, "loss": 0.0978, "num_tokens": 597572923.0, "reward": 1.3872768878936768, "reward_std": 0.44391947984695435, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8738839030265808, "rewards/tag_count_reward/std": 0.2654591202735901, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1104.375, "completions/mean_terminated_length": 836.6991577148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20712801662138405, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12195207244859864, "kl": 0.017547607421875, "learning_rate": 9.691054122141368e-07, "loss": 0.0343, "num_tokens": 598140963.0, "reward": 1.3632813692092896, "reward_std": 0.38065415620803833, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23745527863502502, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1120.25, "completions/mean_terminated_length": 890.2506713867188, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.20734111128869, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11576084163125788, "kl": 0.01727294921875, "learning_rate": 9.689833955479737e-07, "loss": 0.1008, "num_tokens": 598709891.0, "reward": 1.481584906578064, "reward_std": 0.34943288564682007, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.24384641647338867, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1031.044677734375, "completions/mean_terminated_length": 771.8207397460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20755420595599594, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1299264245472782, "kl": 0.02093505859375, "learning_rate": 9.688611470065716e-07, "loss": 0.1242, "num_tokens": 599236951.0, "reward": 1.4888393878936768, "reward_std": 0.428301066160202, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2481052279472351, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1149.2410888671875, "completions/mean_terminated_length": 890.9769897460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2077673006233019, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 1.0732657490736828, "kl": 0.0911865234375, "learning_rate": 9.687386666575858e-07, "loss": 0.108, "num_tokens": 599824643.0, "reward": 1.3844866752624512, "reward_std": 0.4030228555202484, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8376116156578064, "rewards/tag_count_reward/std": 0.3123975992202759, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1089.33935546875, "completions/mean_terminated_length": 827.8864135742188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.20798039529060786, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13010300635074162, "kl": 0.018310546875, "learning_rate": 9.686159545687996e-07, "loss": 0.1124, "num_tokens": 600384011.0, "reward": 1.2918527126312256, "reward_std": 0.366824746131897, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2403814047574997, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1138.493408203125, "completions/mean_terminated_length": 909.8463134765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2081934899579138, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11307137668872197, "kl": 0.017059326171875, "learning_rate": 9.684930108081249e-07, "loss": 0.0679, "num_tokens": 600965336.0, "reward": 1.3867188692092896, "reward_std": 0.3642430603504181, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.2267407774925232, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1137.055908203125, "completions/mean_terminated_length": 861.654052734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20840658462521974, "frac_reward_zero_std": 0.0, "grad_norm": 0.1240092416090791, "kl": 0.016754150390625, "learning_rate": 9.683698354436016e-07, "loss": 0.1253, "num_tokens": 601545697.0, "reward": 1.3727679252624512, "reward_std": 0.39141160249710083, "rewards/accuracy_reward/mean": 0.49074074625968933, "rewards/accuracy_reward/std": 0.5004938244819641, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8995535969734192, "rewards/tag_count_reward/std": 0.24453723430633545, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1168.044677734375, "completions/mean_terminated_length": 905.3333740234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2086196792925257, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11952483737327563, "kl": 0.0171661376953125, "learning_rate": 9.682464285433979e-07, "loss": 0.112, "num_tokens": 602139285.0, "reward": 1.3108259439468384, "reward_std": 0.3328397274017334, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8844866156578064, "rewards/tag_count_reward/std": 0.2631748914718628, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1042.046875, "completions/mean_terminated_length": 820.0245361328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20883277395983166, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1375594131993578, "kl": 0.01861572265625, "learning_rate": 9.681227901758101e-07, "loss": 0.1376, "num_tokens": 602675402.0, "reward": 1.4280134439468384, "reward_std": 0.36848947405815125, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8967633843421936, "rewards/tag_count_reward/std": 0.2407706081867218, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1055.0379638671875, "completions/mean_terminated_length": 801.9299926757812, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.20904586862713762, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12567605039222451, "kl": 0.018524169921875, "learning_rate": 9.679989204092624e-07, "loss": 0.0777, "num_tokens": 603214923.0, "reward": 1.563616156578064, "reward_std": 0.35769954323768616, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.20945784449577332, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1084.3929443359375, "completions/mean_terminated_length": 858.7548217773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20925896329444355, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.130230622770987, "kl": 0.01715087890625, "learning_rate": 9.678748193123075e-07, "loss": 0.1088, "num_tokens": 603761483.0, "reward": 1.3895089626312256, "reward_std": 0.3803202211856842, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.22568464279174805, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1104.602783203125, "completions/mean_terminated_length": 870.7242431640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2094720579617495, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12281837312874369, "kl": 0.0163116455078125, "learning_rate": 9.677504869536255e-07, "loss": 0.0622, "num_tokens": 604324297.0, "reward": 1.4246652126312256, "reward_std": 0.31938737630844116, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.1837497353553772, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1051.3013916015625, "completions/mean_terminated_length": 850.8927612304688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.20968515262905546, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12220977660752079, "kl": 0.018341064453125, "learning_rate": 9.67625923402025e-07, "loss": 0.0494, "num_tokens": 604861360.0, "reward": 1.3867188692092896, "reward_std": 0.35236823558807373, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21263070404529572, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1087.8348388671875, "completions/mean_terminated_length": 910.0264282226562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.20989824729636142, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11963464126886837, "kl": 0.01898193359375, "learning_rate": 9.675011287264427e-07, "loss": 0.0675, "num_tokens": 605419510.0, "reward": 1.3030134439468384, "reward_std": 0.33659690618515015, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.4903542101383209, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9034598469734192, "rewards/tag_count_reward/std": 0.2347692996263504, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 998.779052734375, "completions/mean_terminated_length": 784.4220581054688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21011134196366735, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1318557569983656, "kl": 0.017242431640625, "learning_rate": 9.673761029959426e-07, "loss": 0.098, "num_tokens": 605930915.0, "reward": 1.4475446939468384, "reward_std": 0.37351909279823303, "rewards/accuracy_reward/mean": 0.5509259104728699, "rewards/accuracy_reward/std": 0.49797651171684265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.22552968561649323, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1087.180908203125, "completions/mean_terminated_length": 814.6275634765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2103244366309733, "frac_reward_zero_std": 0.0, "grad_norm": 0.11838296641766269, "kl": 0.0165557861328125, "learning_rate": 9.672508462797168e-07, "loss": 0.0581, "num_tokens": 606491892.0, "reward": 1.2717634439468384, "reward_std": 0.3784574270248413, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8833705186843872, "rewards/tag_count_reward/std": 0.2684724032878876, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1014.4598388671875, "completions/mean_terminated_length": 796.578369140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21053753129827926, "frac_reward_zero_std": 0.0, "grad_norm": 0.1277266733273894, "kl": 0.01873779296875, "learning_rate": 9.671253586470854e-07, "loss": 0.1051, "num_tokens": 607018114.0, "reward": 1.4977679252624512, "reward_std": 0.36809930205345154, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.20721976459026337, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 845.325927734375, "completions/mean_terminated_length": 714.341552734375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.21075062596558522, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13946916096495987, "kl": 0.02032470703125, "learning_rate": 9.669996401674963e-07, "loss": 0.0738, "num_tokens": 607459892.0, "reward": 1.6556919813156128, "reward_std": 0.3361762762069702, "rewards/accuracy_reward/mean": 0.7142857313156128, "rewards/accuracy_reward/std": 0.45225897431373596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1818031221628189, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 993.7500610351562, "completions/mean_terminated_length": 798.5184936523438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.21096372063289115, "frac_reward_zero_std": 0.0, "grad_norm": 0.13296140771030554, "kl": 0.019775390625, "learning_rate": 9.66873690910525e-07, "loss": 0.0335, "num_tokens": 607980164.0, "reward": 1.4977679252624512, "reward_std": 0.35170823335647583, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.2077612727880478, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1008.5870971679688, "completions/mean_terminated_length": 802.9277954101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2111768153001971, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13610661049162912, "kl": 0.019073486328125, "learning_rate": 9.667475109458747e-07, "loss": 0.1097, "num_tokens": 608494155.0, "reward": 1.4341518878936768, "reward_std": 0.36671048402786255, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20146164298057556, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 949.76123046875, "completions/mean_terminated_length": 749.8179931640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.21138990996750306, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13650530439734018, "kl": 0.021575927734375, "learning_rate": 9.66621100343376e-07, "loss": 0.0825, "num_tokens": 608983984.0, "reward": 1.4475446939468384, "reward_std": 0.31377193331718445, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.1880783587694168, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1067.55810546875, "completions/mean_terminated_length": 803.69970703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21160300463480902, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11904261223702872, "kl": 0.018585205078125, "learning_rate": 9.664944591729884e-07, "loss": 0.0986, "num_tokens": 609541210.0, "reward": 1.290178656578064, "reward_std": 0.38875582814216614, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8839285969734192, "rewards/tag_count_reward/std": 0.25728440284729004, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1081.7076416015625, "completions/mean_terminated_length": 778.50146484375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21181609930211495, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15340419817053028, "kl": 0.01947021484375, "learning_rate": 9.663675875047974e-07, "loss": 0.0662, "num_tokens": 610101431.0, "reward": 1.4006696939468384, "reward_std": 0.32147231698036194, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24499371647834778, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 942.4375610351562, "completions/mean_terminated_length": 764.860107421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2120291939694209, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13584641220068627, "kl": 0.019439697265625, "learning_rate": 9.662404854090171e-07, "loss": 0.0676, "num_tokens": 610590763.0, "reward": 1.5117188692092896, "reward_std": 0.34111541509628296, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2073153853416443, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 911.0960083007812, "completions/mean_terminated_length": 742.0179443359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21224228863672687, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1248384385399846, "kl": 0.02117919921875, "learning_rate": 9.661131529559883e-07, "loss": 0.0479, "num_tokens": 611061974.0, "reward": 1.5837054252624512, "reward_std": 0.3017288148403168, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18348205089569092, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 933.7902221679688, "completions/mean_terminated_length": 734.4052734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21245538330403282, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13126097036720397, "kl": 0.022003173828125, "learning_rate": 9.659855902161804e-07, "loss": 0.0855, "num_tokens": 611548440.0, "reward": 1.6484376192092896, "reward_std": 0.3285175859928131, "rewards/accuracy_reward/mean": 0.7053571343421936, "rewards/accuracy_reward/std": 0.45639166235923767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.17569643259048462, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1068.5067138671875, "completions/mean_terminated_length": 855.5733642578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21266847797133875, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12232301469490667, "kl": 0.0171051025390625, "learning_rate": 9.65857797260189e-07, "loss": 0.0551, "num_tokens": 612091419.0, "reward": 1.356584906578064, "reward_std": 0.3327273428440094, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.20082469284534454, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 989.7678833007812, "completions/mean_terminated_length": 763.2086791992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2128815726386447, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.7245688070990458, "kl": 0.0257568359375, "learning_rate": 9.657297741587381e-07, "loss": 0.0279, "num_tokens": 612617267.0, "reward": 1.4709821939468384, "reward_std": 0.3218078017234802, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.21720866858959198, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1081.8773193359375, "completions/mean_terminated_length": 845.7139282226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21309466730595067, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12125873848436415, "kl": 0.017578125, "learning_rate": 9.656015209826788e-07, "loss": 0.0575, "num_tokens": 613170508.0, "reward": 1.4213169813156128, "reward_std": 0.3136279284954071, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.21439646184444427, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1204.328125, "completions/mean_terminated_length": 902.6514892578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21330776197325663, "frac_reward_zero_std": 0.0, "grad_norm": 0.13076941046090076, "kl": 0.01617431640625, "learning_rate": 9.654730378029892e-07, "loss": 0.1085, "num_tokens": 613780783.0, "reward": 1.2126116752624512, "reward_std": 0.3802720606327057, "rewards/accuracy_reward/mean": 0.3214285671710968, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8911830186843872, "rewards/tag_count_reward/std": 0.2547432482242584, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 952.4620971679688, "completions/mean_terminated_length": 811.7254028320312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21352085664056256, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13414171984312873, "kl": 0.021392822265625, "learning_rate": 9.653443246907748e-07, "loss": 0.0751, "num_tokens": 614274334.0, "reward": 1.5412946939468384, "reward_std": 0.3479004204273224, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.18955937027931213, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 944.7857666015625, "completions/mean_terminated_length": 764.2597045898438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2137339513078685, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12487884729558162, "kl": 0.021026611328125, "learning_rate": 9.652153817172686e-07, "loss": 0.0764, "num_tokens": 614770654.0, "reward": 1.4849331378936768, "reward_std": 0.3466637432575226, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21440230309963226, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 936.997802734375, "completions/mean_terminated_length": 784.7283935546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21394704597517447, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1334543784185448, "kl": 0.020751953125, "learning_rate": 9.650862089538307e-07, "loss": 0.0746, "num_tokens": 615259501.0, "reward": 1.4681919813156128, "reward_std": 0.3186980187892914, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.19992130994796753, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1122.5982666015625, "completions/mean_terminated_length": 853.2449340820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21416014064248043, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.130870093656659, "kl": 0.0172882080078125, "learning_rate": 9.649568064719482e-07, "loss": 0.1053, "num_tokens": 615839913.0, "reward": 1.2946429252624512, "reward_std": 0.35064026713371277, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23377349972724915, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1087.259033203125, "completions/mean_terminated_length": 884.724365234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2143732353097864, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11480697121033301, "kl": 0.017730712890625, "learning_rate": 9.648271743432355e-07, "loss": 0.054, "num_tokens": 616394077.0, "reward": 1.4960938692092896, "reward_std": 0.4023757874965668, "rewards/accuracy_reward/mean": 0.6064814925193787, "rewards/accuracy_reward/std": 0.4890965521335602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.22577519714832306, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1120.54248046875, "completions/mean_terminated_length": 860.854248046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21458632997709232, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13828937044665743, "kl": 0.016998291015625, "learning_rate": 9.646973126394341e-07, "loss": 0.1485, "num_tokens": 616973168.0, "reward": 1.2388393878936768, "reward_std": 0.388698935508728, "rewards/accuracy_reward/mean": 0.3541666567325592, "rewards/accuracy_reward/std": 0.4788145422935486, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.24759145081043243, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1159.546875, "completions/mean_terminated_length": 914.0199584960938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.21479942464439827, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12224700834142749, "kl": 0.016021728515625, "learning_rate": 9.64567221432413e-07, "loss": 0.0752, "num_tokens": 617563621.0, "reward": 1.340959906578064, "reward_std": 0.33923614025115967, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2053246796131134, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1085.8951416015625, "completions/mean_terminated_length": 854.0304565429688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.21501251931170423, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14645432242150813, "kl": 0.0203857421875, "learning_rate": 9.644369007941667e-07, "loss": 0.1271, "num_tokens": 618128566.0, "reward": 1.333147406578064, "reward_std": 0.34795358777046204, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.2119780033826828, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1019.1808471679688, "completions/mean_terminated_length": 778.2727661132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2152256139790102, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1338888170324766, "kl": 0.0184326171875, "learning_rate": 9.643063507968185e-07, "loss": 0.1049, "num_tokens": 618658055.0, "reward": 1.4481027126312256, "reward_std": 0.37270089983940125, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8989955186843872, "rewards/tag_count_reward/std": 0.24344676733016968, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1167.625, "completions/mean_terminated_length": 949.3704833984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21543870864631612, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11457849712008734, "kl": 0.0164794921875, "learning_rate": 9.641755715126176e-07, "loss": 0.0632, "num_tokens": 619253631.0, "reward": 1.3621652126312256, "reward_std": 0.3880992829799652, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.22919411957263947, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1028.243408203125, "completions/mean_terminated_length": 836.193603515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21565180331362208, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12193408775829491, "kl": 0.018157958984375, "learning_rate": 9.6404456301394e-07, "loss": 0.0259, "num_tokens": 619780972.0, "reward": 1.3627232313156128, "reward_std": 0.2980118989944458, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.18280036747455597, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1079.779052734375, "completions/mean_terminated_length": 839.7465209960938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.21586489798092803, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.8562240210019115, "kl": 0.06195068359375, "learning_rate": 9.639133253732895e-07, "loss": 0.0978, "num_tokens": 620343097.0, "reward": 1.4285714626312256, "reward_std": 0.3785489499568939, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8928571343421936, "rewards/tag_count_reward/std": 0.2584463059902191, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 946.6004638671875, "completions/mean_terminated_length": 769.6917114257812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.216077992648234, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1465146642191623, "kl": 0.02099609375, "learning_rate": 9.637818586632957e-07, "loss": 0.1322, "num_tokens": 620836006.0, "reward": 1.4140626192092896, "reward_std": 0.33451762795448303, "rewards/accuracy_reward/mean": 0.5208333134651184, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.22998054325580597, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1085.125, "completions/mean_terminated_length": 819.0313110351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21629108731553992, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12907170009071153, "kl": 0.018890380859375, "learning_rate": 9.636501629567153e-07, "loss": 0.1305, "num_tokens": 621388958.0, "reward": 1.4029018878936768, "reward_std": 0.35832539200782776, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22654591500759125, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 987.4598388671875, "completions/mean_terminated_length": 835.9540405273438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.21650418198284588, "frac_reward_zero_std": 0.0, "grad_norm": 0.13506395520538472, "kl": 0.02105712890625, "learning_rate": 9.635182383264322e-07, "loss": 0.0484, "num_tokens": 621908268.0, "reward": 1.4944196939468384, "reward_std": 0.4193144738674164, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.20802249014377594, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 904.10498046875, "completions/mean_terminated_length": 740.6912841796875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.21671727665015184, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12805935434893032, "kl": 0.021331787109375, "learning_rate": 9.63386084845456e-07, "loss": 0.0827, "num_tokens": 622373787.0, "reward": 1.6255581378936768, "reward_std": 0.34637704491615295, "rewards/accuracy_reward/mean": 0.6941964030265808, "rewards/accuracy_reward/std": 0.461262047290802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.1990012526512146, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1033.5670166015625, "completions/mean_terminated_length": 852.036865234375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2169303713174578, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11895942802725464, "kl": 0.0186767578125, "learning_rate": 9.63253702586924e-07, "loss": 0.0893, "num_tokens": 622907609.0, "reward": 1.3844866752624512, "reward_std": 0.36421677470207214, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.19114895164966583, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 979.2232666015625, "completions/mean_terminated_length": 820.2769775390625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21714346598476372, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13699796921985968, "kl": 0.017364501953125, "learning_rate": 9.631210916240995e-07, "loss": 0.0851, "num_tokens": 623410413.0, "reward": 1.5195313692092896, "reward_std": 0.3155477046966553, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1752651482820511, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 950.982177734375, "completions/mean_terminated_length": 751.26123046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21735656065206968, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1450168782326325, "kl": 0.02294921875, "learning_rate": 9.629882520303726e-07, "loss": 0.0752, "num_tokens": 623907637.0, "reward": 1.4492188692092896, "reward_std": 0.2938949167728424, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.20309330523014069, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 925.60498046875, "completions/mean_terminated_length": 752.0386352539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21756965531937564, "frac_reward_zero_std": 0.0, "grad_norm": 0.15304126104011737, "kl": 0.022705078125, "learning_rate": 9.628551838792597e-07, "loss": 0.0876, "num_tokens": 624391700.0, "reward": 1.555803656578064, "reward_std": 0.3249172568321228, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.16709057986736298, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1062.575927734375, "completions/mean_terminated_length": 779.4080200195312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2177827499866816, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12268904468243151, "kl": 0.0164794921875, "learning_rate": 9.627218872444037e-07, "loss": 0.0526, "num_tokens": 624942694.0, "reward": 1.3794643878936768, "reward_std": 0.35217374563217163, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9129464030265808, "rewards/tag_count_reward/std": 0.23698757588863373, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1002.9397583007812, "completions/mean_terminated_length": 859.7081298828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21799584465398752, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12139979369800806, "kl": 0.019012451171875, "learning_rate": 9.625883621995743e-07, "loss": 0.033, "num_tokens": 625460747.0, "reward": 1.5256696939468384, "reward_std": 0.3746279180049896, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.1673406958580017, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1001.4732666015625, "completions/mean_terminated_length": 810.9446411132812, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.21820893932129348, "frac_reward_zero_std": 0.0, "grad_norm": 0.13689314803119318, "kl": 0.0206298828125, "learning_rate": 9.624546088186677e-07, "loss": 0.0904, "num_tokens": 625979487.0, "reward": 1.4587054252624512, "reward_std": 0.36251774430274963, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2177339345216751, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1037.290283203125, "completions/mean_terminated_length": 817.5706787109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.21842203398859944, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12281669816573315, "kl": 0.0189208984375, "learning_rate": 9.623206271757056e-07, "loss": 0.1135, "num_tokens": 626518673.0, "reward": 1.3895089626312256, "reward_std": 0.3988742232322693, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9095982313156128, "rewards/tag_count_reward/std": 0.24331659078598022, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 966.7991333007812, "completions/mean_terminated_length": 827.9042358398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2186351286559054, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12982617643749036, "kl": 0.020721435546875, "learning_rate": 9.621864173448367e-07, "loss": 0.0766, "num_tokens": 627016791.0, "reward": 1.4626116752624512, "reward_std": 0.34920039772987366, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.20491690933704376, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1091.966552734375, "completions/mean_terminated_length": 861.5650634765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.21884822332321133, "frac_reward_zero_std": 0.0, "grad_norm": 0.12834424542553474, "kl": 0.01690673828125, "learning_rate": 9.620519794003362e-07, "loss": 0.1199, "num_tokens": 627576568.0, "reward": 1.364397406578064, "reward_std": 0.33898791670799255, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9045758843421936, "rewards/tag_count_reward/std": 0.23102784156799316, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 973.0089721679688, "completions/mean_terminated_length": 809.9639892578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21906131799051728, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3401349518761631, "kl": 0.0272216796875, "learning_rate": 9.61917313416605e-07, "loss": 0.0819, "num_tokens": 628073740.0, "reward": 1.5524554252624512, "reward_std": 0.33791205286979675, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.190138041973114, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1132.2723388671875, "completions/mean_terminated_length": 892.37744140625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.21927441265782324, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1319965470257071, "kl": 0.018829345703125, "learning_rate": 9.617824194681703e-07, "loss": 0.1119, "num_tokens": 628648278.0, "reward": 1.4497768878936768, "reward_std": 0.35723069310188293, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24061305820941925, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 937.6428833007812, "completions/mean_terminated_length": 745.801025390625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2194875073251292, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14290440942335067, "kl": 0.020538330078125, "learning_rate": 9.616472976296855e-07, "loss": 0.13, "num_tokens": 629138518.0, "reward": 1.5251116752624512, "reward_std": 0.4350827634334564, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.2255041003227234, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1029.5491943359375, "completions/mean_terminated_length": 828.0374755859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.21970060199243513, "frac_reward_zero_std": 0.0, "grad_norm": 0.13104418071881732, "kl": 0.019287109375, "learning_rate": 9.615119479759307e-07, "loss": 0.0817, "num_tokens": 629665356.0, "reward": 1.458147406578064, "reward_std": 0.3374355137348175, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21699519455432892, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1018.15185546875, "completions/mean_terminated_length": 864.994873046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2199136966597411, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13179287190678912, "kl": 0.019989013671875, "learning_rate": 9.61376370581811e-07, "loss": 0.1198, "num_tokens": 630189312.0, "reward": 1.5625001192092896, "reward_std": 0.4315028488636017, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22216394543647766, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1005.3795166015625, "completions/mean_terminated_length": 788.9865112304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22012679132704704, "frac_reward_zero_std": 0.0, "grad_norm": 0.13303082434711633, "kl": 0.017608642578125, "learning_rate": 9.612405655223585e-07, "loss": 0.0692, "num_tokens": 630704490.0, "reward": 1.4062501192092896, "reward_std": 0.3164728283882141, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.208564892411232, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1026.997802734375, "completions/mean_terminated_length": 875.1564331054688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.220339885994353, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12440418972766916, "kl": 0.0181884765625, "learning_rate": 9.611045328727306e-07, "loss": 0.0667, "num_tokens": 631235273.0, "reward": 1.5072544813156128, "reward_std": 0.41397711634635925, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.2241603583097458, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1221.544677734375, "completions/mean_terminated_length": 962.2169799804688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.22055298066165893, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10698898828875082, "kl": 0.014739990234375, "learning_rate": 9.609682727082115e-07, "loss": 0.0715, "num_tokens": 631861277.0, "reward": 1.380022406578064, "reward_std": 0.4156619608402252, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.24660564959049225, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1229.2679443359375, "completions/mean_terminated_length": 922.8711547851562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2207660753289649, "frac_reward_zero_std": 0.0, "grad_norm": 0.12908320574076765, "kl": 0.0152130126953125, "learning_rate": 9.6083178510421e-07, "loss": 0.1468, "num_tokens": 632487733.0, "reward": 1.28515625, "reward_std": 0.41717156767845154, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8766741156578064, "rewards/tag_count_reward/std": 0.27881479263305664, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1030.4710693359375, "completions/mean_terminated_length": 822.5886840820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22097916999627085, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1318622567323445, "kl": 0.019256591796875, "learning_rate": 9.60695070136262e-07, "loss": 0.1062, "num_tokens": 633018600.0, "reward": 1.387834906578064, "reward_std": 0.3541650176048279, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8722098469734192, "rewards/tag_count_reward/std": 0.2633313834667206, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1059.2567138671875, "completions/mean_terminated_length": 837.7349243164062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2211922646635768, "frac_reward_zero_std": 0.0, "grad_norm": 0.13777712246161447, "kl": 0.018157958984375, "learning_rate": 9.60558127880029e-07, "loss": 0.111, "num_tokens": 633566379.0, "reward": 1.3883929252624512, "reward_std": 0.4226250946521759, "rewards/accuracy_reward/mean": 0.5208333134651184, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.23324955999851227, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1124.8192138671875, "completions/mean_terminated_length": 845.718017578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22140535933088273, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10934617098519027, "kl": 0.017120361328125, "learning_rate": 9.604209584112975e-07, "loss": 0.051, "num_tokens": 634141050.0, "reward": 1.4654018878936768, "reward_std": 0.37578099966049194, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.23359987139701843, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1105.134033203125, "completions/mean_terminated_length": 844.56982421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2216184539981887, "frac_reward_zero_std": 0.0, "grad_norm": 0.1316632638977043, "kl": 0.020965576171875, "learning_rate": 9.602835618059808e-07, "loss": 0.1475, "num_tokens": 634697958.0, "reward": 1.5033482313156128, "reward_std": 0.4102972745895386, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8582589030265808, "rewards/tag_count_reward/std": 0.26598531007766724, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1044.60498046875, "completions/mean_terminated_length": 826.4755859375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.22183154866549465, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.2152842543609118, "kl": 0.0208740234375, "learning_rate": 9.601459381401167e-07, "loss": 0.082, "num_tokens": 635238421.0, "reward": 1.430803656578064, "reward_std": 0.44480612874031067, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8638392686843872, "rewards/tag_count_reward/std": 0.25665283203125, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1017.8348388671875, "completions/mean_terminated_length": 823.8248901367188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2220446433328006, "frac_reward_zero_std": 0.0, "grad_norm": 0.13736390359173417, "kl": 0.019500732421875, "learning_rate": 9.600080874898702e-07, "loss": 0.0848, "num_tokens": 635761307.0, "reward": 1.4090402126312256, "reward_std": 0.3900696337223053, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.26011165976524353, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1174.328125, "completions/mean_terminated_length": 865.507568359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22225773800010654, "frac_reward_zero_std": 0.0, "grad_norm": 0.12768329049189028, "kl": 0.0167083740234375, "learning_rate": 9.598700099315307e-07, "loss": 0.1312, "num_tokens": 636360830.0, "reward": 1.2488839626312256, "reward_std": 0.44803357124328613, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8426339030265808, "rewards/tag_count_reward/std": 0.29785096645355225, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1126.6160888671875, "completions/mean_terminated_length": 923.2588500976562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2224708326674125, "frac_reward_zero_std": 0.0, "grad_norm": 0.13348343720204275, "kl": 0.0168609619140625, "learning_rate": 9.597317055415135e-07, "loss": 0.0715, "num_tokens": 636928962.0, "reward": 1.4481027126312256, "reward_std": 0.4428902864456177, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8833705186843872, "rewards/tag_count_reward/std": 0.24562665820121765, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1141.5357666015625, "completions/mean_terminated_length": 867.4883422851562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.22268392733471845, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1292659598688964, "kl": 0.0169525146484375, "learning_rate": 9.595931743963596e-07, "loss": 0.0929, "num_tokens": 637513346.0, "reward": 1.3716518878936768, "reward_std": 0.37760260701179504, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.23209868371486664, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1021.0870971679688, "completions/mean_terminated_length": 859.2222290039062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2228970220020244, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1386405943006892, "kl": 0.01806640625, "learning_rate": 9.594544165727354e-07, "loss": 0.1175, "num_tokens": 638042633.0, "reward": 1.5345982313156128, "reward_std": 0.3629516363143921, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348817229270935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2197883427143097, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1077.38623046875, "completions/mean_terminated_length": 859.9262084960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.22311011666933034, "frac_reward_zero_std": 0.0, "grad_norm": 0.12466369630807479, "kl": 0.018646240234375, "learning_rate": 9.593154321474326e-07, "loss": 0.0991, "num_tokens": 638593014.0, "reward": 1.434709906578064, "reward_std": 0.406714528799057, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8900669813156128, "rewards/tag_count_reward/std": 0.2509413957595825, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 945.8303833007812, "completions/mean_terminated_length": 752.010498046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2233232113366363, "frac_reward_zero_std": 0.0, "grad_norm": 0.15353207489639883, "kl": 0.020294189453125, "learning_rate": 9.591762211973687e-07, "loss": 0.1243, "num_tokens": 639073546.0, "reward": 1.5334821939468384, "reward_std": 0.41594818234443665, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8816964030265808, "rewards/tag_count_reward/std": 0.2578950524330139, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1115.622802734375, "completions/mean_terminated_length": 847.6982421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22353630600394225, "frac_reward_zero_std": 0.0, "grad_norm": 0.16457073235828965, "kl": 0.017852783203125, "learning_rate": 9.59036783799586e-07, "loss": 0.1101, "num_tokens": 639645329.0, "reward": 1.3113839626312256, "reward_std": 0.43677186965942383, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8582589030265808, "rewards/tag_count_reward/std": 0.2857520282268524, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1122.734375, "completions/mean_terminated_length": 839.4898071289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2237494006712482, "frac_reward_zero_std": 0.0, "grad_norm": 0.11942225933021121, "kl": 0.017852783203125, "learning_rate": 9.588971200312525e-07, "loss": 0.089, "num_tokens": 640215850.0, "reward": 1.3777902126312256, "reward_std": 0.4159053564071655, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8978794813156128, "rewards/tag_count_reward/std": 0.24412783980369568, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1123.96875, "completions/mean_terminated_length": 837.5731201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22396249533855414, "frac_reward_zero_std": 0.0, "grad_norm": 0.12926572051627483, "kl": 0.018280029296875, "learning_rate": 9.587572299696617e-07, "loss": 0.1074, "num_tokens": 640785036.0, "reward": 1.3359376192092896, "reward_std": 0.41222265362739563, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8761160969734192, "rewards/tag_count_reward/std": 0.26227980852127075, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1012.5469360351562, "completions/mean_terminated_length": 879.5289306640625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2241755900058601, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12155619563747203, "kl": 0.01824951171875, "learning_rate": 9.586171136922315e-07, "loss": 0.0569, "num_tokens": 641308817.0, "reward": 1.4224331378936768, "reward_std": 0.3785761594772339, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.21372579038143158, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1213.529052734375, "completions/mean_terminated_length": 911.6990966796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22438868467316606, "frac_reward_zero_std": 0.0, "grad_norm": 0.12549264496472973, "kl": 0.01580810546875, "learning_rate": 9.58476771276506e-07, "loss": 0.0889, "num_tokens": 641924478.0, "reward": 1.3889509439468384, "reward_std": 0.4436909258365631, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9001116156578064, "rewards/tag_count_reward/std": 0.2439078539609909, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1090.49560546875, "completions/mean_terminated_length": 888.6432495117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.224601779340472, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12344296556345123, "kl": 0.01678466796875, "learning_rate": 9.583362028001537e-07, "loss": 0.069, "num_tokens": 642486396.0, "reward": 1.3443081378936768, "reward_std": 0.2917519807815552, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.19568641483783722, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1027.388427734375, "completions/mean_terminated_length": 896.2770385742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22481487400777794, "frac_reward_zero_std": 0.0, "grad_norm": 0.13365963009169202, "kl": 0.0179290771484375, "learning_rate": 9.58195408340969e-07, "loss": 0.0537, "num_tokens": 643015610.0, "reward": 1.5106027126312256, "reward_std": 0.3467734754085541, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19157297909259796, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1030.88623046875, "completions/mean_terminated_length": 778.7326049804688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2250279686750839, "frac_reward_zero_std": 0.0, "grad_norm": 0.1379582608634983, "kl": 0.01904296875, "learning_rate": 9.580543879768702e-07, "loss": 0.1071, "num_tokens": 643550487.0, "reward": 1.5039063692092896, "reward_std": 0.4127042889595032, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8722098469734192, "rewards/tag_count_reward/std": 0.2767925560474396, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 982.2857666015625, "completions/mean_terminated_length": 791.5789794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22524106334238986, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13965565352135834, "kl": 0.019622802734375, "learning_rate": 9.579131417859016e-07, "loss": 0.094, "num_tokens": 644057207.0, "reward": 1.4810268878936768, "reward_std": 0.37192073464393616, "rewards/accuracy_reward/mean": 0.5740740895271301, "rewards/accuracy_reward/std": 0.4950558841228485, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.2109544575214386, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 935.9688110351562, "completions/mean_terminated_length": 802.5249633789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22545415800969582, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1324908828382511, "kl": 0.019805908203125, "learning_rate": 9.57771669846232e-07, "loss": 0.0627, "num_tokens": 644542793.0, "reward": 1.5463169813156128, "reward_std": 0.4019099175930023, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.2158125787973404, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1066.9754638671875, "completions/mean_terminated_length": 843.8931884765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22566725267700175, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1147936390042102, "kl": 0.018341064453125, "learning_rate": 9.576299722361556e-07, "loss": 0.0757, "num_tokens": 645086062.0, "reward": 1.454241156578064, "reward_std": 0.3843628168106079, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.21683764457702637, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1111.2857666015625, "completions/mean_terminated_length": 838.6397705078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2258803473443077, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11575035667430436, "kl": 0.0166015625, "learning_rate": 9.574880490340908e-07, "loss": 0.0576, "num_tokens": 645650014.0, "reward": 1.3945313692092896, "reward_std": 0.33647313714027405, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.19151432812213898, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1125.1273193359375, "completions/mean_terminated_length": 893.1200561523438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.22609344201161366, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11575299408640558, "kl": 0.017730712890625, "learning_rate": 9.573459003185816e-07, "loss": 0.0793, "num_tokens": 646223431.0, "reward": 1.3593751192092896, "reward_std": 0.3176526427268982, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.21143031120300293, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 972.654052734375, "completions/mean_terminated_length": 763.3200073242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22630653667891962, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11779861628180471, "kl": 0.017669677734375, "learning_rate": 9.572035261682961e-07, "loss": 0.0498, "num_tokens": 646727868.0, "reward": 1.3733259439468384, "reward_std": 0.32931098341941833, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.171964630484581, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1058.65185546875, "completions/mean_terminated_length": 823.61328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.22651963134622555, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12909247656160433, "kl": 0.0170135498046875, "learning_rate": 9.570609266620277e-07, "loss": 0.0931, "num_tokens": 647266416.0, "reward": 1.4224331378936768, "reward_std": 0.3048005998134613, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.19962133467197418, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1001.138427734375, "completions/mean_terminated_length": 813.8052978515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2267327260135315, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.20340384321624796, "kl": 0.020965576171875, "learning_rate": 9.569181018786942e-07, "loss": 0.1119, "num_tokens": 647777630.0, "reward": 1.5318081378936768, "reward_std": 0.3819688856601715, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.24489878118038177, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1050.638427734375, "completions/mean_terminated_length": 792.8932495117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22694582068083746, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12182738191951983, "kl": 0.01837158203125, "learning_rate": 9.567750518973384e-07, "loss": 0.0405, "num_tokens": 648324412.0, "reward": 1.352678656578064, "reward_std": 0.3457717299461365, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.21386082470417023, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1005.5870971679688, "completions/mean_terminated_length": 785.8351440429688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.22715891534814342, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1338490146706846, "kl": 0.01898193359375, "learning_rate": 9.566317767971272e-07, "loss": 0.1153, "num_tokens": 648843107.0, "reward": 1.4598214626312256, "reward_std": 0.3113630712032318, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21618320047855377, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1007.8795166015625, "completions/mean_terminated_length": 805.4026489257812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.22737201001544935, "frac_reward_zero_std": 0.0, "grad_norm": 0.12914294594114475, "kl": 0.019622802734375, "learning_rate": 9.564882766573525e-07, "loss": 0.0663, "num_tokens": 649360093.0, "reward": 1.4486607313156128, "reward_std": 0.36453840136528015, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.21438556909561157, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 973.4531860351562, "completions/mean_terminated_length": 787.7984619140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2275851046827553, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13327261206914826, "kl": 0.017669677734375, "learning_rate": 9.563445515574307e-07, "loss": 0.0752, "num_tokens": 649876968.0, "reward": 1.3398438692092896, "reward_std": 0.34925320744514465, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21955746412277222, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1050.54248046875, "completions/mean_terminated_length": 853.1845092773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22779819935006126, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1338623522910935, "kl": 0.0163421630859375, "learning_rate": 9.562006015769027e-07, "loss": 0.0753, "num_tokens": 650422219.0, "reward": 1.4235491752624512, "reward_std": 0.36818650364875793, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.22267401218414307, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 960.997802734375, "completions/mean_terminated_length": 812.0177612304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22801129401736722, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13236301406448922, "kl": 0.021240234375, "learning_rate": 9.560564267954338e-07, "loss": 0.0922, "num_tokens": 650920442.0, "reward": 1.6032366752624512, "reward_std": 0.37877747416496277, "rewards/accuracy_reward/mean": 0.6919642686843872, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.2232842743396759, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1049.1429443359375, "completions/mean_terminated_length": 851.508056640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22822438868467318, "frac_reward_zero_std": 0.0, "grad_norm": 0.1470817554146858, "kl": 0.017730712890625, "learning_rate": 9.559120272928135e-07, "loss": 0.0915, "num_tokens": 651461130.0, "reward": 1.3738839626312256, "reward_std": 0.34070566296577454, "rewards/accuracy_reward/mean": 0.46990740299224854, "rewards/accuracy_reward/std": 0.4996722936630249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.2131679803133011, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1104.49560546875, "completions/mean_terminated_length": 850.577880859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2284374833519791, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11615639755764579, "kl": 0.015960693359375, "learning_rate": 9.557674031489563e-07, "loss": 0.0711, "num_tokens": 652022536.0, "reward": 1.3426339626312256, "reward_std": 0.33037620782852173, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.21992461383342743, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1003.2366333007812, "completions/mean_terminated_length": 779.5609741210938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.22865057801928507, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12237143573072615, "kl": 0.019012451171875, "learning_rate": 9.556225544438998e-07, "loss": 0.0288, "num_tokens": 652540594.0, "reward": 1.4687501192092896, "reward_std": 0.3836686909198761, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.21600989997386932, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1162.3616943359375, "completions/mean_terminated_length": 897.9536743164062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.22886367268659102, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11945904960646864, "kl": 0.0159759521484375, "learning_rate": 9.554774812578078e-07, "loss": 0.0656, "num_tokens": 653130292.0, "reward": 1.32421875, "reward_std": 0.33299005031585693, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8934151530265808, "rewards/tag_count_reward/std": 0.2616334557533264, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1072.7098388671875, "completions/mean_terminated_length": 806.7216186523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22907676735389698, "frac_reward_zero_std": 0.0, "grad_norm": 0.12838378433575587, "kl": 0.018402099609375, "learning_rate": 9.55332183670966e-07, "loss": 0.0967, "num_tokens": 653678546.0, "reward": 1.4882813692092896, "reward_std": 0.3962567150592804, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.2347799688577652, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1059.3660888671875, "completions/mean_terminated_length": 882.4526977539062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.2292898620212029, "frac_reward_zero_std": 0.0, "grad_norm": 0.13060781021638646, "kl": 0.019561767578125, "learning_rate": 9.551866617637863e-07, "loss": 0.0806, "num_tokens": 654224070.0, "reward": 1.5502232313156128, "reward_std": 0.3903213143348694, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.21642272174358368, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 966.029052734375, "completions/mean_terminated_length": 788.9791870117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22950295668850887, "frac_reward_zero_std": 0.0, "grad_norm": 0.12991373348762655, "kl": 0.01898193359375, "learning_rate": 9.550409156168037e-07, "loss": 0.0577, "num_tokens": 654727139.0, "reward": 1.4726563692092896, "reward_std": 0.33379653096199036, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.22768032550811768, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1143.8125, "completions/mean_terminated_length": 870.4534912109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22971605135581483, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12163567022458427, "kl": 0.01654052734375, "learning_rate": 9.548949453106776e-07, "loss": 0.0917, "num_tokens": 655315711.0, "reward": 1.3504464626312256, "reward_std": 0.3578961491584778, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.20881615579128265, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1000.560302734375, "completions/mean_terminated_length": 786.5671997070312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.22992914602312078, "frac_reward_zero_std": 0.0, "grad_norm": 0.13204220851346132, "kl": 0.017852783203125, "learning_rate": 9.547487509261913e-07, "loss": 0.1097, "num_tokens": 655841770.0, "reward": 1.4196429252624512, "reward_std": 0.37791186571121216, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.21985933184623718, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1030.2098388671875, "completions/mean_terminated_length": 818.9703369140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2301422406904267, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12427325355228985, "kl": 0.016754150390625, "learning_rate": 9.546023325442523e-07, "loss": 0.0804, "num_tokens": 656373208.0, "reward": 1.4157366752624512, "reward_std": 0.3472166359424591, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.16643086075782776, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 965.0089721679688, "completions/mean_terminated_length": 736.7026977539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23035533535773267, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14395707121049647, "kl": 0.019256591796875, "learning_rate": 9.544556902458919e-07, "loss": 0.086, "num_tokens": 656883660.0, "reward": 1.4888393878936768, "reward_std": 0.3880871832370758, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.194285050034523, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 944.482177734375, "completions/mean_terminated_length": 757.2010498046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.23056843002503863, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1435070377985579, "kl": 0.020538330078125, "learning_rate": 9.543088241122653e-07, "loss": 0.0696, "num_tokens": 657375604.0, "reward": 1.4665179252624512, "reward_std": 0.3584713339805603, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.19695264101028442, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1029.8973388671875, "completions/mean_terminated_length": 831.7066650390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2307815246923446, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1241564280155971, "kl": 0.01812744140625, "learning_rate": 9.541617342246518e-07, "loss": 0.0921, "num_tokens": 657906534.0, "reward": 1.4157366752624512, "reward_std": 0.3852246403694153, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21015624701976776, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 941.5826416015625, "completions/mean_terminated_length": 777.0385131835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.23099461935965052, "frac_reward_zero_std": 0.0, "grad_norm": 0.13440716688540302, "kl": 0.02044677734375, "learning_rate": 9.540144206644545e-07, "loss": 0.1254, "num_tokens": 658394827.0, "reward": 1.5239956378936768, "reward_std": 0.33290010690689087, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19590957462787628, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1082.9576416015625, "completions/mean_terminated_length": 856.9834594726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.23120771402695647, "frac_reward_zero_std": 0.0, "grad_norm": 0.17389404479686382, "kl": 0.02276611328125, "learning_rate": 9.538668835131996e-07, "loss": 0.0912, "num_tokens": 658950504.0, "reward": 1.4168527126312256, "reward_std": 0.3355737328529358, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.1893114298582077, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 980.72998046875, "completions/mean_terminated_length": 752.2357788085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23142080869426243, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12917280431621653, "kl": 0.01849365234375, "learning_rate": 9.537191228525382e-07, "loss": 0.0623, "num_tokens": 659454095.0, "reward": 1.4179688692092896, "reward_std": 0.3551293909549713, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.2071828842163086, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1012.7344360351562, "completions/mean_terminated_length": 722.8599853515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2316339033615684, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 6.093094122262371, "kl": 0.2044677734375, "learning_rate": 9.535711387642447e-07, "loss": 0.0984, "num_tokens": 659981704.0, "reward": 1.4921876192092896, "reward_std": 0.33223626017570496, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.2020309567451477, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 930.216552734375, "completions/mean_terminated_length": 737.0916137695312, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.23184699802887432, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12818750344132163, "kl": 0.019744873046875, "learning_rate": 9.534229313302163e-07, "loss": 0.0599, "num_tokens": 660469737.0, "reward": 1.4179688692092896, "reward_std": 0.39471009373664856, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21606117486953735, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 997.794677734375, "completions/mean_terminated_length": 800.0106201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23206009269618028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13105103456407677, "kl": 0.019744873046875, "learning_rate": 9.532745006324749e-07, "loss": 0.084, "num_tokens": 660987117.0, "reward": 1.5005581378936768, "reward_std": 0.3536769449710846, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17670518159866333, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 969.2031860351562, "completions/mean_terminated_length": 808.7666625976562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.23227318736348623, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14048518410823574, "kl": 0.019561767578125, "learning_rate": 9.531258467531656e-07, "loss": 0.0611, "num_tokens": 661496296.0, "reward": 1.5412946939468384, "reward_std": 0.3625069260597229, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.1981375813484192, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 952.49560546875, "completions/mean_terminated_length": 742.7180786132812, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2324862820307922, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13248315495935378, "kl": 0.020782470703125, "learning_rate": 9.529769697745566e-07, "loss": 0.108, "num_tokens": 661988518.0, "reward": 1.5725446939468384, "reward_std": 0.3769925534725189, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20643207430839539, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1059.69873046875, "completions/mean_terminated_length": 818.1138916015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23269937669809812, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11587636675394795, "kl": 0.018646240234375, "learning_rate": 9.5282786977904e-07, "loss": 0.0924, "num_tokens": 662530895.0, "reward": 1.4520089626312256, "reward_std": 0.3783363997936249, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22406357526779175, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 957.529052734375, "completions/mean_terminated_length": 795.3564453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23291247136540408, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1532105867200508, "kl": 0.021697998046875, "learning_rate": 9.526785468491315e-07, "loss": 0.0452, "num_tokens": 663030572.0, "reward": 1.4135044813156128, "reward_std": 0.36551082134246826, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21910780668258667, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1068.83935546875, "completions/mean_terminated_length": 875.1016235351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23312556603271004, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.48233333838817555, "kl": 0.0185546875, "learning_rate": 9.525290010674696e-07, "loss": 0.1089, "num_tokens": 663578948.0, "reward": 1.3811384439468384, "reward_std": 0.2970651686191559, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19375662505626678, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 981.6585083007812, "completions/mean_terminated_length": 763.8037719726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.233338660700016, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1976339788494781, "kl": 0.027923583984375, "learning_rate": 9.523792325168168e-07, "loss": 0.0726, "num_tokens": 664086027.0, "reward": 1.462053656578064, "reward_std": 0.3783024847507477, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.2176220864057541, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1017.1317138671875, "completions/mean_terminated_length": 809.8526000976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23355175536732192, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13112878736351327, "kl": 0.02001953125, "learning_rate": 9.522292412800582e-07, "loss": 0.0831, "num_tokens": 664609462.0, "reward": 1.3844866752624512, "reward_std": 0.27677157521247864, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18447525799274445, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1093.8795166015625, "completions/mean_terminated_length": 809.026123046875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.23376485003462788, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12394259257550978, "kl": 0.01763916015625, "learning_rate": 9.520790274402025e-07, "loss": 0.0929, "num_tokens": 665165360.0, "reward": 1.4693081378936768, "reward_std": 0.36182984709739685, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.1947399377822876, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1000.1250610351562, "completions/mean_terminated_length": 806.0740356445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23397794470193384, "frac_reward_zero_std": 0.0, "grad_norm": 0.137832899404671, "kl": 0.019287109375, "learning_rate": 9.519285910803816e-07, "loss": 0.0837, "num_tokens": 665679176.0, "reward": 1.5742188692092896, "reward_std": 0.3561047911643982, "rewards/accuracy_reward/mean": 0.6696428656578064, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2343062311410904, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1062.10498046875, "completions/mean_terminated_length": 882.6148071289062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2341910393692398, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16347228099305386, "kl": 0.0167236328125, "learning_rate": 9.517779322838506e-07, "loss": 0.0772, "num_tokens": 666230551.0, "reward": 1.4436384439468384, "reward_std": 0.380472332239151, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.18789489567279816, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1060.7701416015625, "completions/mean_terminated_length": 795.0849609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23440413403654572, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1272744486399644, "kl": 0.01922607421875, "learning_rate": 9.516270511339877e-07, "loss": 0.0368, "num_tokens": 666771168.0, "reward": 1.4564732313156128, "reward_std": 0.2963971495628357, "rewards/accuracy_reward/mean": 0.5509259104728699, "rewards/accuracy_reward/std": 0.49797651171684265, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.20531632006168365, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 950.7522583007812, "completions/mean_terminated_length": 723.0215454101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.23461722870385168, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15090076382920362, "kl": 0.02001953125, "learning_rate": 9.514759477142936e-07, "loss": 0.0994, "num_tokens": 667258497.0, "reward": 1.5496652126312256, "reward_std": 0.32379835844039917, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23282569646835327, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1077.5023193359375, "completions/mean_terminated_length": 816.3201293945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.23483032337115764, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12612378405274122, "kl": 0.018890380859375, "learning_rate": 9.513246221083932e-07, "loss": 0.1023, "num_tokens": 667812834.0, "reward": 1.2232143878936768, "reward_std": 0.3761574625968933, "rewards/accuracy_reward/mean": 0.3169642984867096, "rewards/accuracy_reward/std": 0.4658135175704956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22216394543647766, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1134.65625, "completions/mean_terminated_length": 895.3858642578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2350434180384636, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12488737895319843, "kl": 0.0155487060546875, "learning_rate": 9.51173074400033e-07, "loss": 0.1017, "num_tokens": 668393864.0, "reward": 1.3275669813156128, "reward_std": 0.36529234051704407, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.23073045909404755, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1240.8304443359375, "completions/mean_terminated_length": 910.8552856445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23525651270576953, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12567540433915395, "kl": 0.0176544189453125, "learning_rate": 9.510213046730833e-07, "loss": 0.1129, "num_tokens": 669021516.0, "reward": 1.3013393878936768, "reward_std": 0.39001351594924927, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8705357313156128, "rewards/tag_count_reward/std": 0.27978527545928955, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1148.2366943359375, "completions/mean_terminated_length": 925.1754760742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23546960737307548, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11805506609562257, "kl": 0.0167083740234375, "learning_rate": 9.508693130115372e-07, "loss": 0.0716, "num_tokens": 669611718.0, "reward": 1.4369419813156128, "reward_std": 0.43341225385665894, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9012276530265808, "rewards/tag_count_reward/std": 0.24264031648635864, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1048.5335693359375, "completions/mean_terminated_length": 896.9434204101562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.23568270204038144, "frac_reward_zero_std": 0.0, "grad_norm": 0.1370970954546268, "kl": 0.0194091796875, "learning_rate": 9.507170994995101e-07, "loss": 0.0947, "num_tokens": 670150869.0, "reward": 1.5195313692092896, "reward_std": 0.41213858127593994, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.22474436461925507, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1184.0670166015625, "completions/mean_terminated_length": 954.6610107421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2358957967076874, "frac_reward_zero_std": 0.0, "grad_norm": 0.11874730087574582, "kl": 0.0170745849609375, "learning_rate": 9.505646642212405e-07, "loss": 0.0998, "num_tokens": 670751667.0, "reward": 1.4447544813156128, "reward_std": 0.38637077808380127, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9045758843421936, "rewards/tag_count_reward/std": 0.2273675799369812, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1144.841552734375, "completions/mean_terminated_length": 875.2029418945312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.23610889137499333, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1161403916441395, "kl": 0.016937255859375, "learning_rate": 9.504120072610904e-07, "loss": 0.1007, "num_tokens": 671338172.0, "reward": 1.3364956378936768, "reward_std": 0.4597889482975006, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8833705186843872, "rewards/tag_count_reward/std": 0.2616143822669983, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 996.6585083007812, "completions/mean_terminated_length": 821.4349365234375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2363219860422993, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1214782776680158, "kl": 0.01959228515625, "learning_rate": 9.502591287035428e-07, "loss": 0.0707, "num_tokens": 671844355.0, "reward": 1.5931919813156128, "reward_std": 0.2933616638183594, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.17478010058403015, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 985.6428833007812, "completions/mean_terminated_length": 805.3472900390625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.23653508070960524, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2651676553697034, "kl": 0.026123046875, "learning_rate": 9.501060286332048e-07, "loss": 0.0579, "num_tokens": 672355939.0, "reward": 1.5083706378936768, "reward_std": 0.3285157084465027, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1929948478937149, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1115.01123046875, "completions/mean_terminated_length": 857.1766357421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2367481753769112, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12933324792104972, "kl": 0.01837158203125, "learning_rate": 9.499527071348056e-07, "loss": 0.0954, "num_tokens": 672923048.0, "reward": 1.3867188692092896, "reward_std": 0.38290297985076904, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8733258843421936, "rewards/tag_count_reward/std": 0.2468990832567215, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1080.263427734375, "completions/mean_terminated_length": 791.344970703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.23696127004421713, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12221739273146501, "kl": 0.016937255859375, "learning_rate": 9.497991642931966e-07, "loss": 0.0424, "num_tokens": 673475342.0, "reward": 1.3911831378936768, "reward_std": 0.359164834022522, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.20491689443588257, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1020.0245971679688, "completions/mean_terminated_length": 848.6953125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2371743647115231, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13417200527324016, "kl": 0.017822265625, "learning_rate": 9.496454001933522e-07, "loss": 0.0761, "num_tokens": 674002297.0, "reward": 1.4185268878936768, "reward_std": 0.3305349349975586, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.18429669737815857, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1176.296875, "completions/mean_terminated_length": 912.7587280273438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.23738745937882905, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10479471547803462, "kl": 0.0163116455078125, "learning_rate": 9.494914149203691e-07, "loss": 0.0311, "num_tokens": 674597518.0, "reward": 1.3035714626312256, "reward_std": 0.3514695465564728, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.2151644378900528, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1046.3951416015625, "completions/mean_terminated_length": 815.2554931640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.237600554046135, "frac_reward_zero_std": 0.0, "grad_norm": 0.13787646103822634, "kl": 0.018768310546875, "learning_rate": 9.493372085594664e-07, "loss": 0.1247, "num_tokens": 675132799.0, "reward": 1.2566964626312256, "reward_std": 0.3821662664413452, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8705357313156128, "rewards/tag_count_reward/std": 0.2519102394580841, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1071.8482666015625, "completions/mean_terminated_length": 843.2727661132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23781364871344093, "frac_reward_zero_std": 0.0, "grad_norm": 0.12289956267841243, "kl": 0.0166015625, "learning_rate": 9.491827811959852e-07, "loss": 0.0768, "num_tokens": 675679723.0, "reward": 1.4330357313156128, "reward_std": 0.4378242492675781, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2271430343389511, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 931.513427734375, "completions/mean_terminated_length": 755.5297241210938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2380267433807469, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12399650810657688, "kl": 0.0185546875, "learning_rate": 9.490281329153895e-07, "loss": 0.0511, "num_tokens": 676164289.0, "reward": 1.5452009439468384, "reward_std": 0.36436668038368225, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.1962151974439621, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1058.18310546875, "completions/mean_terminated_length": 777.404052734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.23823983804805285, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13645637945747782, "kl": 0.01806640625, "learning_rate": 9.488732638032653e-07, "loss": 0.0754, "num_tokens": 676702611.0, "reward": 1.3470982313156128, "reward_std": 0.3205311894416809, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.26109668612480164, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1110.9285888671875, "completions/mean_terminated_length": 862.1016845703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2384529327153588, "frac_reward_zero_std": 0.0, "grad_norm": 0.13171320101676276, "kl": 0.017578125, "learning_rate": 9.487181739453207e-07, "loss": 0.1547, "num_tokens": 677264259.0, "reward": 1.3258929252624512, "reward_std": 0.4249982237815857, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.25470954179763794, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1161.7723388671875, "completions/mean_terminated_length": 897.1884155273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23866602738266474, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1179876710063193, "kl": 0.01617431640625, "learning_rate": 9.485628634273861e-07, "loss": 0.0974, "num_tokens": 677861421.0, "reward": 1.3258929252624512, "reward_std": 0.3822091221809387, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8616071343421936, "rewards/tag_count_reward/std": 0.26722386479377747, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1164.03125, "completions/mean_terminated_length": 840.6279907226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2388791220499707, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258067203278698, "kl": 0.017181396484375, "learning_rate": 9.484073323354139e-07, "loss": 0.1062, "num_tokens": 678449467.0, "reward": 1.3593751192092896, "reward_std": 0.4407947063446045, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8683035969734192, "rewards/tag_count_reward/std": 0.2721400260925293, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1006.9375610351562, "completions/mean_terminated_length": 794.247314453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23909221671727665, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13871572209995917, "kl": 0.01922607421875, "learning_rate": 9.482515807554788e-07, "loss": 0.098, "num_tokens": 678970575.0, "reward": 1.3978794813156128, "reward_std": 0.33868899941444397, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8666294813156128, "rewards/tag_count_reward/std": 0.2567592263221741, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1040.94873046875, "completions/mean_terminated_length": 848.1090087890625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2393053113845826, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12808558757991312, "kl": 0.018035888671875, "learning_rate": 9.480956087737774e-07, "loss": 0.0444, "num_tokens": 679511032.0, "reward": 1.4966518878936768, "reward_std": 0.35735705494880676, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.2142428457736969, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1223.6785888671875, "completions/mean_terminated_length": 928.9212036132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23951840605188854, "frac_reward_zero_std": 0.0, "grad_norm": 0.12305518503553173, "kl": 0.01556396484375, "learning_rate": 9.479394164766281e-07, "loss": 0.1285, "num_tokens": 680134200.0, "reward": 1.356584906578064, "reward_std": 0.46816831827163696, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8543526530265808, "rewards/tag_count_reward/std": 0.2810535132884979, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1021.6652221679688, "completions/mean_terminated_length": 815.297607421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2397315007191945, "frac_reward_zero_std": 0.0, "grad_norm": 0.1374047914609163, "kl": 0.017791748046875, "learning_rate": 9.477830039504714e-07, "loss": 0.0918, "num_tokens": 680663634.0, "reward": 1.4910714626312256, "reward_std": 0.3470616936683655, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8928571343421936, "rewards/tag_count_reward/std": 0.23699811100959778, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1131.44873046875, "completions/mean_terminated_length": 854.3517456054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.23994459538650045, "frac_reward_zero_std": 0.0, "grad_norm": 0.129918987632575, "kl": 0.016326904296875, "learning_rate": 9.476263712818698e-07, "loss": 0.1077, "num_tokens": 681239659.0, "reward": 1.3431919813156128, "reward_std": 0.38743600249290466, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8989955186843872, "rewards/tag_count_reward/std": 0.2399759739637375, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1028.294677734375, "completions/mean_terminated_length": 786.0442504882812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2401576900538064, "frac_reward_zero_std": 0.0, "grad_norm": 0.14134344131680723, "kl": 0.018707275390625, "learning_rate": 9.474695185575072e-07, "loss": 0.1402, "num_tokens": 681765375.0, "reward": 1.5033482313156128, "reward_std": 0.41235625743865967, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.8917410969734192, "rewards/tag_count_reward/std": 0.24859534204006195, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1128.15625, "completions/mean_terminated_length": 863.8333129882812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.24037078472111234, "frac_reward_zero_std": 0.0, "grad_norm": 0.12919158868521846, "kl": 0.01788330078125, "learning_rate": 9.473124458641901e-07, "loss": 0.0785, "num_tokens": 682338389.0, "reward": 1.450334906578064, "reward_std": 0.43026217818260193, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8744419813156128, "rewards/tag_count_reward/std": 0.2747781574726105, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1157.46875, "completions/mean_terminated_length": 860.625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2405838793884183, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12335542332175008, "kl": 0.016571044921875, "learning_rate": 9.471551532888456e-07, "loss": 0.107, "num_tokens": 682917095.0, "reward": 1.4068081378936768, "reward_std": 0.3423016369342804, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8934151530265808, "rewards/tag_count_reward/std": 0.2512744665145874, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1014.9777221679688, "completions/mean_terminated_length": 830.12109375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.24079697405572426, "frac_reward_zero_std": 0.0, "grad_norm": 0.13719931551206793, "kl": 0.016845703125, "learning_rate": 9.469976409185235e-07, "loss": 0.0741, "num_tokens": 683450349.0, "reward": 1.3962054252624512, "reward_std": 0.34097760915756226, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.19511540234088898, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1255.7366943359375, "completions/mean_terminated_length": 945.7205200195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2410100687230302, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11326073734368769, "kl": 0.0157928466796875, "learning_rate": 9.468399088403948e-07, "loss": 0.0575, "num_tokens": 684083959.0, "reward": 1.2410714626312256, "reward_std": 0.3917922377586365, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8683035969734192, "rewards/tag_count_reward/std": 0.2736770510673523, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1124.88623046875, "completions/mean_terminated_length": 908.7300415039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24122316339033617, "frac_reward_zero_std": 0.0, "grad_norm": 0.126178465595855, "kl": 0.017486572265625, "learning_rate": 9.466819571417519e-07, "loss": 0.1059, "num_tokens": 684655844.0, "reward": 1.4419643878936768, "reward_std": 0.3981620967388153, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.2214886099100113, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1139.59375, "completions/mean_terminated_length": 908.0392456054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2414362580576421, "frac_reward_zero_std": 0.0, "grad_norm": 0.12362567164885911, "kl": 0.020111083984375, "learning_rate": 9.465237859100093e-07, "loss": 0.0984, "num_tokens": 685228798.0, "reward": 1.3883929252624512, "reward_std": 0.3500884771347046, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8683035969734192, "rewards/tag_count_reward/std": 0.26799824833869934, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1151.28125, "completions/mean_terminated_length": 909.9546508789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24164935272494806, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11896610908952401, "kl": 0.01885986328125, "learning_rate": 9.463653952327024e-07, "loss": 0.066, "num_tokens": 685814556.0, "reward": 1.4274554252624512, "reward_std": 0.3754560053348541, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9029017686843872, "rewards/tag_count_reward/std": 0.2412969470024109, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1129.74560546875, "completions/mean_terminated_length": 845.140380859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24186244739225402, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12633658538528192, "kl": 0.015045166015625, "learning_rate": 9.462067851974886e-07, "loss": 0.1229, "num_tokens": 686396874.0, "reward": 1.4157366752624512, "reward_std": 0.3558761477470398, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23101703822612762, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1116.212158203125, "completions/mean_terminated_length": 862.0880737304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24207554205955997, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11538481927029474, "kl": 0.017913818359375, "learning_rate": 9.460479558921459e-07, "loss": 0.0911, "num_tokens": 686966553.0, "reward": 1.3593751192092896, "reward_std": 0.4412151873111725, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.24347303807735443, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1131.2410888671875, "completions/mean_terminated_length": 871.186279296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2422886367268659, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12929923824805095, "kl": 0.016876220703125, "learning_rate": 9.458889074045747e-07, "loss": 0.0747, "num_tokens": 687538645.0, "reward": 1.4843751192092896, "reward_std": 0.37626728415489197, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9084821343421936, "rewards/tag_count_reward/std": 0.2329067587852478, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1048.044677734375, "completions/mean_terminated_length": 843.752685546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24250173139417186, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12517031114912322, "kl": 0.019561767578125, "learning_rate": 9.45729639822796e-07, "loss": 0.0793, "num_tokens": 688073977.0, "reward": 1.493303656578064, "reward_std": 0.3122672140598297, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19207492470741272, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1004.9219360351562, "completions/mean_terminated_length": 818.2658081054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24271482606147782, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13755099713409896, "kl": 0.020721435546875, "learning_rate": 9.455701532349522e-07, "loss": 0.1012, "num_tokens": 688594166.0, "reward": 1.419084906578064, "reward_std": 0.3213981091976166, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.20244067907333374, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1075.46435546875, "completions/mean_terminated_length": 820.6873168945312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.24292792072878377, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13788831577203978, "kl": 0.02099609375, "learning_rate": 9.454104477293068e-07, "loss": 0.1142, "num_tokens": 689141958.0, "reward": 1.3989956378936768, "reward_std": 0.37277212738990784, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8833705186843872, "rewards/tag_count_reward/std": 0.2545716464519501, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 970.4888916015625, "completions/mean_terminated_length": 781.0052490234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2431410153960897, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1361819506275324, "kl": 0.019134521484375, "learning_rate": 9.452505233942447e-07, "loss": 0.078, "num_tokens": 689642689.0, "reward": 1.4754464626312256, "reward_std": 0.3517928421497345, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.19571910798549652, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1057.419677734375, "completions/mean_terminated_length": 848.5946044921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.24335411006339566, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11490567076191115, "kl": 0.0169677734375, "learning_rate": 9.450903803182717e-07, "loss": 0.1029, "num_tokens": 690188397.0, "reward": 1.3247768878936768, "reward_std": 0.43005073070526123, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.25505974888801575, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1085.4576416015625, "completions/mean_terminated_length": 853.4874877929688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.24356720473070162, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11883591467647857, "kl": 0.018524169921875, "learning_rate": 9.449300185900149e-07, "loss": 0.1034, "num_tokens": 690741882.0, "reward": 1.4029018878936768, "reward_std": 0.36611419916152954, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.2140796184539795, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1023.9866333007812, "completions/mean_terminated_length": 821.3743286132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24378029939800758, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11420225374750567, "kl": 0.018218994140625, "learning_rate": 9.447694382982221e-07, "loss": 0.0751, "num_tokens": 691267556.0, "reward": 1.3856027126312256, "reward_std": 0.32527297735214233, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18088066577911377, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1120.200927734375, "completions/mean_terminated_length": 877.1436157226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2439933940653135, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12201012108747257, "kl": 0.0174407958984375, "learning_rate": 9.446086395317622e-07, "loss": 0.0887, "num_tokens": 691833566.0, "reward": 1.3844866752624512, "reward_std": 0.33895665407180786, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.23308825492858887, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1091.88623046875, "completions/mean_terminated_length": 844.8005981445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24420648873261946, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1170179622713891, "kl": 0.017425537109375, "learning_rate": 9.44447622379625e-07, "loss": 0.0709, "num_tokens": 692395227.0, "reward": 1.4402902126312256, "reward_std": 0.41064462065696716, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.2317454218864441, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1149.8035888671875, "completions/mean_terminated_length": 850.40478515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24441958339992542, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12010893928556403, "kl": 0.0166015625, "learning_rate": 9.442863869309213e-07, "loss": 0.0646, "num_tokens": 692982195.0, "reward": 1.3816964626312256, "reward_std": 0.34269511699676514, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.2228260338306427, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1135.4888916015625, "completions/mean_terminated_length": 886.6221923828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.24463267806723138, "frac_reward_zero_std": 0.0, "grad_norm": 0.6000192959715754, "kl": 0.05126953125, "learning_rate": 9.441249332748824e-07, "loss": 0.0663, "num_tokens": 693564446.0, "reward": 1.297991156578064, "reward_std": 0.3227459192276001, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.22814933955669403, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1029.790283203125, "completions/mean_terminated_length": 805.0626831054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2448457727345373, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14467243051835282, "kl": 0.01776123046875, "learning_rate": 9.439632615008604e-07, "loss": 0.1248, "num_tokens": 694096192.0, "reward": 1.3493304252624512, "reward_std": 0.3884987235069275, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.215381920337677, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1156.2835693359375, "completions/mean_terminated_length": 913.0880737304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24505886740184327, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1319405199713387, "kl": 0.01885986328125, "learning_rate": 9.438013716983289e-07, "loss": 0.0482, "num_tokens": 694685551.0, "reward": 1.2751116752624512, "reward_std": 0.31994858384132385, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.21274222433567047, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1050.013427734375, "completions/mean_terminated_length": 823.073974609375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.24527196206914922, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12958232570022526, "kl": 0.01776123046875, "learning_rate": 9.436392639568809e-07, "loss": 0.0861, "num_tokens": 695218021.0, "reward": 1.3945313692092896, "reward_std": 0.3540402948856354, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.23448731005191803, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1000.2991333007812, "completions/mean_terminated_length": 822.4908447265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24548505673645518, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12241726943639346, "kl": 0.01922607421875, "learning_rate": 9.434769383662307e-07, "loss": 0.0279, "num_tokens": 695735803.0, "reward": 1.4709821939468384, "reward_std": 0.3499070107936859, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1687558889389038, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 959.93310546875, "completions/mean_terminated_length": 775.274169921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2456981514037611, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14022076688205312, "kl": 0.019073486328125, "learning_rate": 9.433143950162134e-07, "loss": 0.116, "num_tokens": 696236109.0, "reward": 1.4748884439468384, "reward_std": 0.3407088816165924, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.19655847549438477, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 949.872802734375, "completions/mean_terminated_length": 753.3658447265625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.24591124607106707, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13397138516990292, "kl": 0.02117919921875, "learning_rate": 9.43151633996784e-07, "loss": 0.0716, "num_tokens": 696732244.0, "reward": 1.4626116752624512, "reward_std": 0.3569982349872589, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21863439679145813, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 960.8281860351562, "completions/mean_terminated_length": 752.646240234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24612434073837303, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1448121986343773, "kl": 0.020355224609375, "learning_rate": 9.429886553980184e-07, "loss": 0.1232, "num_tokens": 697226647.0, "reward": 1.4508929252624512, "reward_std": 0.3497101366519928, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2338162064552307, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1034.0023193359375, "completions/mean_terminated_length": 820.2405395507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24633743540567898, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1299296540948816, "kl": 0.016815185546875, "learning_rate": 9.428254593101128e-07, "loss": 0.0688, "num_tokens": 697761992.0, "reward": 1.380022406578064, "reward_std": 0.3787865936756134, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.22577518224716187, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 959.33935546875, "completions/mean_terminated_length": 777.8958740234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2465505300729849, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1420988994117055, "kl": 0.019744873046875, "learning_rate": 9.426620458233837e-07, "loss": 0.1231, "num_tokens": 698259984.0, "reward": 1.5429688692092896, "reward_std": 0.2873460054397583, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.2086479365825653, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 922.6339721679688, "completions/mean_terminated_length": 745.2506713867188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.24676362474029087, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1277125771130333, "kl": 0.02081298828125, "learning_rate": 9.424984150282679e-07, "loss": 0.0683, "num_tokens": 698740332.0, "reward": 1.5145089626312256, "reward_std": 0.28034472465515137, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18940123915672302, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1071.80810546875, "completions/mean_terminated_length": 849.82470703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24697671940759683, "frac_reward_zero_std": 0.0, "grad_norm": 0.13903483653511328, "kl": 0.017578125, "learning_rate": 9.423345670153225e-07, "loss": 0.1148, "num_tokens": 699294918.0, "reward": 1.340959906578064, "reward_std": 0.37994667887687683, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22031240165233612, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1073.544677734375, "completions/mean_terminated_length": 797.1232299804688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2471898140749028, "frac_reward_zero_std": 0.0, "grad_norm": 0.1331404826986215, "kl": 0.017822265625, "learning_rate": 9.421705018752252e-07, "loss": 0.1375, "num_tokens": 699842362.0, "reward": 1.4001116752624512, "reward_std": 0.4011727273464203, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21589355170726776, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1078.279052734375, "completions/mean_terminated_length": 844.5789184570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24740290874220872, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12522523995887483, "kl": 0.020538330078125, "learning_rate": 9.420062196987729e-07, "loss": 0.0608, "num_tokens": 700396551.0, "reward": 1.4927456378936768, "reward_std": 0.35562044382095337, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960493743419647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1941235363483429, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1087.7366943359375, "completions/mean_terminated_length": 846.32958984375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.24761600340951467, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.17851690284384417, "kl": 0.024688720703125, "learning_rate": 9.418417205768836e-07, "loss": 0.1213, "num_tokens": 700944913.0, "reward": 1.2751116752624512, "reward_std": 0.3589559495449066, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8978794813156128, "rewards/tag_count_reward/std": 0.25587278604507446, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1046.8460693359375, "completions/mean_terminated_length": 815.8104858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24782909807682063, "frac_reward_zero_std": 0.0, "grad_norm": 0.39339112525832093, "kl": 0.0181884765625, "learning_rate": 9.41677004600595e-07, "loss": 0.1299, "num_tokens": 701480140.0, "reward": 1.5295759439468384, "reward_std": 0.350629061460495, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.21685130894184113, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 988.91748046875, "completions/mean_terminated_length": 740.9228515625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2480421927441266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13564337694287523, "kl": 0.019378662109375, "learning_rate": 9.415120718610646e-07, "loss": 0.082, "num_tokens": 701989143.0, "reward": 1.4637277126312256, "reward_std": 0.3330329656600952, "rewards/accuracy_reward/mean": 0.5717592835426331, "rewards/accuracy_reward/std": 0.49539753794670105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9123883843421936, "rewards/tag_count_reward/std": 0.2280583381652832, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1087.796875, "completions/mean_terminated_length": 862.9559326171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24825528741143252, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11561748260031024, "kl": 0.018585205078125, "learning_rate": 9.413469224495701e-07, "loss": 0.0907, "num_tokens": 702544012.0, "reward": 1.4280134439468384, "reward_std": 0.3665209412574768, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9034598469734192, "rewards/tag_count_reward/std": 0.23357515037059784, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1060.8795166015625, "completions/mean_terminated_length": 839.7213134765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.24846838207873848, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1291688686745437, "kl": 0.017852783203125, "learning_rate": 9.41181556457509e-07, "loss": 0.0828, "num_tokens": 703082582.0, "reward": 1.4056919813156128, "reward_std": 0.35266488790512085, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.2245887815952301, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1045.22998046875, "completions/mean_terminated_length": 813.8214721679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.24868147674604443, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12649082532650746, "kl": 0.0171661376953125, "learning_rate": 9.41015973976399e-07, "loss": 0.0982, "num_tokens": 703626109.0, "reward": 1.4168527126312256, "reward_std": 0.34334796667099, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.21320530772209167, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 982.9152221679688, "completions/mean_terminated_length": 754.888916015625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2488945714133504, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12389374550919054, "kl": 0.01812744140625, "learning_rate": 9.408501750978769e-07, "loss": 0.0522, "num_tokens": 704132567.0, "reward": 1.3543527126312256, "reward_std": 0.2990226745605469, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1695888340473175, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1116.59375, "completions/mean_terminated_length": 848.9482421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24910766608065632, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12005508477364267, "kl": 0.0162353515625, "learning_rate": 9.406841599137e-07, "loss": 0.0695, "num_tokens": 704702049.0, "reward": 1.4414063692092896, "reward_std": 0.30317384004592896, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18412980437278748, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1076.071533203125, "completions/mean_terminated_length": 831.7318115234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.24932076074796228, "frac_reward_zero_std": 0.0, "grad_norm": 0.1393368838053309, "kl": 0.019378662109375, "learning_rate": 9.40517928515745e-07, "loss": 0.067, "num_tokens": 705253601.0, "reward": 1.4157366752624512, "reward_std": 0.3868698179721832, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.21418675780296326, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1040.0960693359375, "completions/mean_terminated_length": 786.7122802734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24953385541526824, "frac_reward_zero_std": 0.0, "grad_norm": 0.13988853026545434, "kl": 0.018707275390625, "learning_rate": 9.403514809960081e-07, "loss": 0.1051, "num_tokens": 705792508.0, "reward": 1.3688616752624512, "reward_std": 0.3498436510562897, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.2114827185869217, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1134.790283203125, "completions/mean_terminated_length": 830.3869018554688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2497469500825742, "frac_reward_zero_std": 0.0, "grad_norm": 0.130087327815181, "kl": 0.017120361328125, "learning_rate": 9.401848174466053e-07, "loss": 0.0665, "num_tokens": 706371918.0, "reward": 1.2845982313156128, "reward_std": 0.40470418334007263, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24499371647834778, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 970.1094360351562, "completions/mean_terminated_length": 760.2799682617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.24996004474988012, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11770602808542026, "kl": 0.017486572265625, "learning_rate": 9.400179379597721e-07, "loss": 0.0579, "num_tokens": 706870303.0, "reward": 1.387834906578064, "reward_std": 0.26535508036613464, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19872502982616425, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1069.8973388671875, "completions/mean_terminated_length": 788.8333129882812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2501731394171861, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11319270923412346, "kl": 0.01739501953125, "learning_rate": 9.398508426278637e-07, "loss": 0.1043, "num_tokens": 707423409.0, "reward": 1.4062501192092896, "reward_std": 0.37307223677635193, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.21254922449588776, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 992.0558471679688, "completions/mean_terminated_length": 793.1909790039062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.250386234084492, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1334958717509608, "kl": 0.01873779296875, "learning_rate": 9.396835315433543e-07, "loss": 0.1403, "num_tokens": 707942842.0, "reward": 1.5390626192092896, "reward_std": 0.42992103099823, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.23065266013145447, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1025.9263916015625, "completions/mean_terminated_length": 779.609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.25059932875179797, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13944002728397867, "kl": 0.019287109375, "learning_rate": 9.395160047988379e-07, "loss": 0.074, "num_tokens": 708469945.0, "reward": 1.3521206378936768, "reward_std": 0.36226117610931396, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.2208217978477478, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1026.1295166015625, "completions/mean_terminated_length": 820.6595458984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2508124234191039, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12728507298197414, "kl": 0.017425537109375, "learning_rate": 9.393482624870281e-07, "loss": 0.0812, "num_tokens": 709001987.0, "reward": 1.4185268878936768, "reward_std": 0.3376017212867737, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.19246125221252441, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1014.8772583007812, "completions/mean_terminated_length": 817.045166015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2510255180864099, "frac_reward_zero_std": 0.0, "grad_norm": 0.13364980461477466, "kl": 0.0184326171875, "learning_rate": 9.391803047007567e-07, "loss": 0.0535, "num_tokens": 709527084.0, "reward": 1.4107143878936768, "reward_std": 0.2955436706542969, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.1875898689031601, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1134.2835693359375, "completions/mean_terminated_length": 904.5781860351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25123861275371584, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.113471643875866, "kl": 0.0142974853515625, "learning_rate": 9.39012131532976e-07, "loss": 0.1108, "num_tokens": 710106267.0, "reward": 1.4068081378936768, "reward_std": 0.3691440522670746, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.21175408363342285, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 953.1897583007812, "completions/mean_terminated_length": 750.4470825195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2514517074210218, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13565280406939878, "kl": 0.019195556640625, "learning_rate": 9.388437430767568e-07, "loss": 0.1158, "num_tokens": 710592608.0, "reward": 1.5184152126312256, "reward_std": 0.3573632538318634, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21910780668258667, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1042.8304443359375, "completions/mean_terminated_length": 850.3510131835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25166480208832775, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10860656602048822, "kl": 0.0178375244140625, "learning_rate": 9.386751394252895e-07, "loss": -0.0002, "num_tokens": 711136564.0, "reward": 1.5055804252624512, "reward_std": 0.3420940637588501, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.16209600865840912, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1039.09375, "completions/mean_terminated_length": 829.6981201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2518778967556337, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11561013449727518, "kl": 0.01983642578125, "learning_rate": 9.385063206718826e-07, "loss": 0.0633, "num_tokens": 711672670.0, "reward": 1.5396206378936768, "reward_std": 0.4023180603981018, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.20518478751182556, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1084.122802734375, "completions/mean_terminated_length": 821.2471923828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2520909914229396, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12212536123438834, "kl": 0.018310546875, "learning_rate": 9.383372869099652e-07, "loss": 0.0889, "num_tokens": 712222453.0, "reward": 1.4324777126312256, "reward_std": 0.3501165211200714, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20216916501522064, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1063.857177734375, "completions/mean_terminated_length": 816.4468994140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25230408609024557, "frac_reward_zero_std": 0.0, "grad_norm": 0.1422060690030813, "kl": 0.017974853515625, "learning_rate": 9.381680382330841e-07, "loss": 0.0958, "num_tokens": 712773205.0, "reward": 1.3744419813156128, "reward_std": 0.3827194571495056, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.23238559067249298, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1070.9576416015625, "completions/mean_terminated_length": 877.6390380859375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.25251718075755153, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12678636513120645, "kl": 0.0194091796875, "learning_rate": 9.379985747349056e-07, "loss": 0.0494, "num_tokens": 713324434.0, "reward": 1.4174107313156128, "reward_std": 0.3997945487499237, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.21109940111637115, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1033.575927734375, "completions/mean_terminated_length": 848.891845703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2527302754248575, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1291948979286699, "kl": 0.0198974609375, "learning_rate": 9.378288965092145e-07, "loss": 0.0511, "num_tokens": 713856740.0, "reward": 1.462053656578064, "reward_std": 0.3138492703437805, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18993109464645386, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1124.4398193359375, "completions/mean_terminated_length": 855.6224365234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25294337009216344, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13215387343909, "kl": 0.018035888671875, "learning_rate": 9.376590036499152e-07, "loss": 0.0893, "num_tokens": 714439977.0, "reward": 1.364397406578064, "reward_std": 0.3469241261482239, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.21394765377044678, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1063.265625, "completions/mean_terminated_length": 829.3232421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2531564647594694, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11914512375886392, "kl": 0.01751708984375, "learning_rate": 9.374888962510302e-07, "loss": 0.0596, "num_tokens": 714983792.0, "reward": 1.4218751192092896, "reward_std": 0.3380487561225891, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19915880262851715, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1105.6273193359375, "completions/mean_terminated_length": 862.0927124023438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.25336955942677536, "frac_reward_zero_std": 0.0, "grad_norm": 0.12119205690446866, "kl": 0.017578125, "learning_rate": 9.373185744067006e-07, "loss": 0.0383, "num_tokens": 715562233.0, "reward": 1.4977679252624512, "reward_std": 0.40225204825401306, "rewards/accuracy_reward/mean": 0.5949074029922485, "rewards/accuracy_reward/std": 0.49147912859916687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.2016134113073349, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1156.544677734375, "completions/mean_terminated_length": 900.3793334960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2535826540940813, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12164394110606176, "kl": 0.0157012939453125, "learning_rate": 9.371480382111869e-07, "loss": 0.0766, "num_tokens": 716144525.0, "reward": 1.2544643878936768, "reward_std": 0.3277755677700043, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839529275894165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20808550715446472, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1128.6942138671875, "completions/mean_terminated_length": 881.2889404296875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2537957487613872, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11431480954744451, "kl": 0.0168304443359375, "learning_rate": 9.369772877588679e-07, "loss": 0.0739, "num_tokens": 716714148.0, "reward": 1.3939732313156128, "reward_std": 0.41670045256614685, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8805803656578064, "rewards/tag_count_reward/std": 0.2616889178752899, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1154.118408203125, "completions/mean_terminated_length": 890.60400390625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2540088434286932, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12084179441173191, "kl": 0.017303466796875, "learning_rate": 9.368063231442404e-07, "loss": 0.0827, "num_tokens": 717299593.0, "reward": 1.4101563692092896, "reward_std": 0.3786208927631378, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.21001364290714264, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 892.4263916015625, "completions/mean_terminated_length": 734.0482177734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.25422193809599913, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15501915749431122, "kl": 0.021697998046875, "learning_rate": 9.366351444619207e-07, "loss": 0.093, "num_tokens": 717761608.0, "reward": 1.6021206378936768, "reward_std": 0.3924897611141205, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.2047584503889084, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1093.5045166015625, "completions/mean_terminated_length": 873.2362670898438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2544350327633051, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11945032285356612, "kl": 0.0194091796875, "learning_rate": 9.364637518066431e-07, "loss": 0.0961, "num_tokens": 718315482.0, "reward": 1.4040179252624512, "reward_std": 0.32677170634269714, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18845300376415253, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1028.546875, "completions/mean_terminated_length": 816.9622192382812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25464812743061105, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12288813369377476, "kl": 0.0185546875, "learning_rate": 9.362921452732598e-07, "loss": 0.0877, "num_tokens": 718845743.0, "reward": 1.5172991752624512, "reward_std": 0.34804025292396545, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9123883843421936, "rewards/tag_count_reward/std": 0.2212115377187729, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1042.97998046875, "completions/mean_terminated_length": 863.13427734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.254861222097917, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1275499782028147, "kl": 0.020050048828125, "learning_rate": 9.361203249567424e-07, "loss": 0.0951, "num_tokens": 719383430.0, "reward": 1.4687501192092896, "reward_std": 0.33225592970848083, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.2053801566362381, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1053.2366943359375, "completions/mean_terminated_length": 862.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.25507431676522296, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13040743814687228, "kl": 0.019287109375, "learning_rate": 9.359482909521802e-07, "loss": 0.1084, "num_tokens": 719922608.0, "reward": 1.442522406578064, "reward_std": 0.37619295716285706, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.2044655829668045, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1188.607177734375, "completions/mean_terminated_length": 938.4668579101562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2552874114325289, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12890735174445375, "kl": 0.01708984375, "learning_rate": 9.357760433547807e-07, "loss": 0.1283, "num_tokens": 720528560.0, "reward": 1.446428656578064, "reward_std": 0.3331867754459381, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9084821343421936, "rewards/tag_count_reward/std": 0.21866366267204285, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1098.029052734375, "completions/mean_terminated_length": 888.3623657226562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2555005060998349, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11050528131623757, "kl": 0.017822265625, "learning_rate": 9.356035822598699e-07, "loss": 0.0727, "num_tokens": 721097149.0, "reward": 1.5301339626312256, "reward_std": 0.30820971727371216, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.18505382537841797, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1133.0491943359375, "completions/mean_terminated_length": 903.0335083007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2557136007671408, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11803966505992211, "kl": 0.01708984375, "learning_rate": 9.354309077628918e-07, "loss": 0.1082, "num_tokens": 721690483.0, "reward": 1.3437501192092896, "reward_std": 0.3707781136035919, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.2252444326877594, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1081.625, "completions/mean_terminated_length": 848.7312622070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.25592669543444674, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3887678034627187, "kl": 0.020965576171875, "learning_rate": 9.352580199594084e-07, "loss": 0.0412, "num_tokens": 722252747.0, "reward": 1.3688616752624512, "reward_std": 0.36884328722953796, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.21372579038143158, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 977.9063110351562, "completions/mean_terminated_length": 748.8076171875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2561397901017527, "frac_reward_zero_std": 0.0, "grad_norm": 0.136920335796482, "kl": 0.019012451171875, "learning_rate": 9.350849189451e-07, "loss": 0.0558, "num_tokens": 722761137.0, "reward": 1.575334906578064, "reward_std": 0.3258565664291382, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969305515289307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.1958458423614502, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1108.140625, "completions/mean_terminated_length": 875.1392822265625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.25635288476905865, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12358339108355751, "kl": 0.01861572265625, "learning_rate": 9.349116048157645e-07, "loss": 0.0524, "num_tokens": 723329440.0, "reward": 1.3973214626312256, "reward_std": 0.384210467338562, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1891935020685196, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1137.243408203125, "completions/mean_terminated_length": 875.5316162109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2565659794363646, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12183898447172595, "kl": 0.018310546875, "learning_rate": 9.347380776673185e-07, "loss": 0.1109, "num_tokens": 723905261.0, "reward": 1.3984376192092896, "reward_std": 0.3694113492965698, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.21083606779575348, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 992.22998046875, "completions/mean_terminated_length": 813.05224609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25677907410367057, "frac_reward_zero_std": 0.0, "grad_norm": 0.13564871199570044, "kl": 0.018035888671875, "learning_rate": 9.345643375957955e-07, "loss": 0.083, "num_tokens": 724410628.0, "reward": 1.4375001192092896, "reward_std": 0.3085968494415283, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9129464030265808, "rewards/tag_count_reward/std": 0.20942506194114685, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 956.19873046875, "completions/mean_terminated_length": 780.8316040039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2569921687709765, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13642613669603001, "kl": 0.02191162109375, "learning_rate": 9.343903846973475e-07, "loss": 0.0887, "num_tokens": 724905117.0, "reward": 1.4575893878936768, "reward_std": 0.31205737590789795, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.16782118380069733, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 918.232177734375, "completions/mean_terminated_length": 719.55908203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2572052634382825, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14826309943318305, "kl": 0.021240234375, "learning_rate": 9.342162190682442e-07, "loss": 0.061, "num_tokens": 725382805.0, "reward": 1.5055804252624512, "reward_std": 0.3464234471321106, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18508079648017883, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 963.1741333007812, "completions/mean_terminated_length": 792.180908203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2574183581055884, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12389017570976057, "kl": 0.0206298828125, "learning_rate": 9.34041840804873e-07, "loss": 0.0751, "num_tokens": 725883843.0, "reward": 1.6411831378936768, "reward_std": 0.3124764859676361, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137671947479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17976601421833038, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1035.649658203125, "completions/mean_terminated_length": 818.9132690429688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.25763145277289434, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.22205956941423924, "kl": 0.024169921875, "learning_rate": 9.338672500037387e-07, "loss": 0.104, "num_tokens": 726424582.0, "reward": 1.4380581378936768, "reward_std": 0.29289552569389343, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.21241335570812225, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1056.3795166015625, "completions/mean_terminated_length": 853.7903442382812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2578445474402003, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12472590543238136, "kl": 0.0186767578125, "learning_rate": 9.336924467614641e-07, "loss": 0.0733, "num_tokens": 726969856.0, "reward": 1.4006696939468384, "reward_std": 0.34661921858787537, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.20614159107208252, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1101.87060546875, "completions/mean_terminated_length": 864.0167236328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25805764210750626, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1182586249231736, "kl": 0.017425537109375, "learning_rate": 9.335174311747893e-07, "loss": 0.0584, "num_tokens": 727536406.0, "reward": 1.4760044813156128, "reward_std": 0.3336218595504761, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.1743152141571045, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1129.779052734375, "completions/mean_terminated_length": 908.4902954101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2582707367748122, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11041467398894227, "kl": 0.018768310546875, "learning_rate": 9.333422033405722e-07, "loss": 0.0602, "num_tokens": 728113155.0, "reward": 1.4414063692092896, "reward_std": 0.3210981488227844, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.22768031060695648, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1057.1920166015625, "completions/mean_terminated_length": 854.768798828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2584838314421182, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13459729994573613, "kl": 0.019073486328125, "learning_rate": 9.331667633557877e-07, "loss": 0.1247, "num_tokens": 728656825.0, "reward": 1.3705357313156128, "reward_std": 0.34038785099983215, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8995535969734192, "rewards/tag_count_reward/std": 0.2462465763092041, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1151.899658203125, "completions/mean_terminated_length": 842.4354858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25869692610942413, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12343725773168232, "kl": 0.01708984375, "learning_rate": 9.329911113175289e-07, "loss": 0.0654, "num_tokens": 729242588.0, "reward": 1.3465402126312256, "reward_std": 0.3305869996547699, "rewards/accuracy_reward/mean": 0.45370370149612427, "rewards/accuracy_reward/std": 0.49842923879623413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23580926656723022, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1143.0826416015625, "completions/mean_terminated_length": 918.7437133789062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2589100207767301, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10932401625998242, "kl": 0.0166473388671875, "learning_rate": 9.328152473230052e-07, "loss": 0.1034, "num_tokens": 729825553.0, "reward": 1.4380581378936768, "reward_std": 0.30797672271728516, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.18520469963550568, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1006.9397583007812, "completions/mean_terminated_length": 804.2799682617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.259123115444036, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1362992922962979, "kl": 0.019622802734375, "learning_rate": 9.326391714695443e-07, "loss": 0.0845, "num_tokens": 730350374.0, "reward": 1.5117188692092896, "reward_std": 0.34411683678627014, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19555239379405975, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1060.712158203125, "completions/mean_terminated_length": 812.5111694335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25933621011134195, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12419567343749208, "kl": 0.0175018310546875, "learning_rate": 9.324628838545905e-07, "loss": 0.0888, "num_tokens": 730890133.0, "reward": 1.5161831378936768, "reward_std": 0.4181612730026245, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22583600878715515, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1004.6607666015625, "completions/mean_terminated_length": 804.872314453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2595493047786479, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13049072762405026, "kl": 0.02130126953125, "learning_rate": 9.322863845757054e-07, "loss": 0.0892, "num_tokens": 731411229.0, "reward": 1.3638393878936768, "reward_std": 0.3830179274082184, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.22072072327136993, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1075.953125, "completions/mean_terminated_length": 898.9841918945312, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.25976239944595386, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12467741413432065, "kl": 0.018829345703125, "learning_rate": 9.321096737305679e-07, "loss": 0.0853, "num_tokens": 731961240.0, "reward": 1.5061384439468384, "reward_std": 0.3604619801044464, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.21286541223526, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1115.8192138671875, "completions/mean_terminated_length": 833.9970703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2599754941132598, "frac_reward_zero_std": 0.0, "grad_norm": 0.8438915871257744, "kl": 0.086090087890625, "learning_rate": 9.319327514169742e-07, "loss": 0.0851, "num_tokens": 732530599.0, "reward": 1.4224331378936768, "reward_std": 0.39743557572364807, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8777901530265808, "rewards/tag_count_reward/std": 0.2574244737625122, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1021.7098388671875, "completions/mean_terminated_length": 838.0579223632812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2601885887805658, "frac_reward_zero_std": 0.0, "grad_norm": 0.14065909553917508, "kl": 0.0198974609375, "learning_rate": 9.31755617732837e-07, "loss": 0.0705, "num_tokens": 733059701.0, "reward": 1.493303656578064, "reward_std": 0.3630177080631256, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19200991094112396, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1029.7410888671875, "completions/mean_terminated_length": 824.9973754882812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.26040168344787173, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1350408112016386, "kl": 0.0190582275390625, "learning_rate": 9.315782727761861e-07, "loss": 0.061, "num_tokens": 733585457.0, "reward": 1.4274554252624512, "reward_std": 0.28504908084869385, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20284497737884521, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1132.727783203125, "completions/mean_terminated_length": 845.53076171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2606147781151777, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13741809612190153, "kl": 0.01788330078125, "learning_rate": 9.314007166451688e-07, "loss": 0.146, "num_tokens": 734160903.0, "reward": 1.2801339626312256, "reward_std": 0.3947989046573639, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.25560733675956726, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1179.359375, "completions/mean_terminated_length": 920.026123046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2608278727824836, "frac_reward_zero_std": 0.0, "grad_norm": 0.12950320030316076, "kl": 0.0167236328125, "learning_rate": 9.312229494380485e-07, "loss": 0.0808, "num_tokens": 734755656.0, "reward": 1.4525669813156128, "reward_std": 0.38254234194755554, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9123883843421936, "rewards/tag_count_reward/std": 0.22434963285923004, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1040.930908203125, "completions/mean_terminated_length": 838.43701171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26104096744978955, "frac_reward_zero_std": 0.0, "grad_norm": 0.13226573877139045, "kl": 0.0194091796875, "learning_rate": 9.310449712532058e-07, "loss": 0.1033, "num_tokens": 735293721.0, "reward": 1.5407366752624512, "reward_std": 0.30943822860717773, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.19310477375984192, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1183.575927734375, "completions/mean_terminated_length": 981.1625366210938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2612540621170955, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11186234345853083, "kl": 0.017730712890625, "learning_rate": 9.308667821891381e-07, "loss": 0.1347, "num_tokens": 735895163.0, "reward": 1.3789063692092896, "reward_std": 0.4183768630027771, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8833705186843872, "rewards/tag_count_reward/std": 0.26951199769973755, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1216.165283203125, "completions/mean_terminated_length": 970.9421997070312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26146715678440147, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11737919478591904, "kl": 0.016510009765625, "learning_rate": 9.306883823444592e-07, "loss": 0.076, "num_tokens": 736516693.0, "reward": 1.3666294813156128, "reward_std": 0.3469386398792267, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2239709198474884, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1147.203125, "completions/mean_terminated_length": 927.0083618164062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2616802514517074, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10692977032712898, "kl": 0.017364501953125, "learning_rate": 9.305097718178999e-07, "loss": 0.0519, "num_tokens": 737102672.0, "reward": 1.4408482313156128, "reward_std": 0.3874766528606415, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.23455984890460968, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1013.044677734375, "completions/mean_terminated_length": 827.8421630859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2618933461190134, "frac_reward_zero_std": 0.0, "grad_norm": 0.12712121772950633, "kl": 0.020782470703125, "learning_rate": 9.303309507083074e-07, "loss": 0.093, "num_tokens": 737632948.0, "reward": 1.4531251192092896, "reward_std": 0.4119478166103363, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8950892686843872, "rewards/tag_count_reward/std": 0.2374086230993271, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1068.3170166015625, "completions/mean_terminated_length": 928.3622436523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26210644078631934, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11498952459920828, "kl": 0.01800537109375, "learning_rate": 9.301519191146457e-07, "loss": 0.0481, "num_tokens": 738180082.0, "reward": 1.5513393878936768, "reward_std": 0.3388901650905609, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.16445478796958923, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1116.1273193359375, "completions/mean_terminated_length": 897.9201049804688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2623195354536253, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.22290054873954854, "kl": 0.018402099609375, "learning_rate": 9.299726771359947e-07, "loss": 0.1197, "num_tokens": 738749867.0, "reward": 1.368303656578064, "reward_std": 0.37348824739456177, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.23410436511039734, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 977.6250610351562, "completions/mean_terminated_length": 808.9096069335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2625326301209312, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12189476131660924, "kl": 0.02105712890625, "learning_rate": 9.297932248715515e-07, "loss": 0.0258, "num_tokens": 739255379.0, "reward": 1.5652902126312256, "reward_std": 0.32277098298072815, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.1828029304742813, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1062.790283203125, "completions/mean_terminated_length": 815.1116943359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.26274572478823716, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1259967988638335, "kl": 0.016937255859375, "learning_rate": 9.29613562420629e-07, "loss": -0.0015, "num_tokens": 739803493.0, "reward": 1.3922991752624512, "reward_std": 0.279218852519989, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.19506020843982697, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1078.1763916015625, "completions/mean_terminated_length": 886.2861328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2629588194555431, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12569743507603784, "kl": 0.017303466796875, "learning_rate": 9.294336898826566e-07, "loss": 0.1075, "num_tokens": 740356548.0, "reward": 1.399553656578064, "reward_std": 0.3404173254966736, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.2329067587852478, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1093.59375, "completions/mean_terminated_length": 856.986083984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26317191412284907, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11675731763920931, "kl": 0.017425537109375, "learning_rate": 9.2925360735718e-07, "loss": 0.0371, "num_tokens": 740916094.0, "reward": 1.4598214626312256, "reward_std": 0.3263718783855438, "rewards/accuracy_reward/mean": 0.5578703880310059, "rewards/accuracy_reward/std": 0.49721553921699524, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2142340987920761, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1127.372802734375, "completions/mean_terminated_length": 886.1943359375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.26338500879015503, "frac_reward_zero_std": 0.0, "grad_norm": 0.1101613500676768, "kl": 0.016510009765625, "learning_rate": 9.290733149438611e-07, "loss": 0.0569, "num_tokens": 741499461.0, "reward": 1.4397321939468384, "reward_std": 0.3753872811794281, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.20800147950649261, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1105.419677734375, "completions/mean_terminated_length": 878.2603759765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.263598103457461, "frac_reward_zero_std": 0.0, "grad_norm": 0.12431351426306203, "kl": 0.018768310546875, "learning_rate": 9.288928127424781e-07, "loss": 0.0796, "num_tokens": 742063025.0, "reward": 1.4765626192092896, "reward_std": 0.37347933650016785, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9029017686843872, "rewards/tag_count_reward/std": 0.22757954895496368, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1112.8460693359375, "completions/mean_terminated_length": 881.0111083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.26381119812476694, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12892978323328838, "kl": 0.018035888671875, "learning_rate": 9.28712100852925e-07, "loss": 0.0685, "num_tokens": 742628908.0, "reward": 1.4430804252624512, "reward_std": 0.33087581396102905, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22092141211032867, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 950.43310546875, "completions/mean_terminated_length": 806.30810546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2640242927920729, "frac_reward_zero_std": 0.0, "grad_norm": 0.14815803607912337, "kl": 0.019927978515625, "learning_rate": 9.285311793752119e-07, "loss": 0.1113, "num_tokens": 743123982.0, "reward": 1.4609376192092896, "reward_std": 0.4009788930416107, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.21947002410888672, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1087.321533203125, "completions/mean_terminated_length": 845.81005859375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2642373874593788, "frac_reward_zero_std": 0.0, "grad_norm": 0.1160957619740128, "kl": 0.017608642578125, "learning_rate": 9.283500484094652e-07, "loss": 0.0478, "num_tokens": 743685038.0, "reward": 1.442522406578064, "reward_std": 0.361387699842453, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.21535509824752808, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1077.97998046875, "completions/mean_terminated_length": 820.4039306640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26445048212668476, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258940088707082, "kl": 0.0165252685546875, "learning_rate": 9.281687080559269e-07, "loss": 0.1007, "num_tokens": 744238037.0, "reward": 1.3794643878936768, "reward_std": 0.3933660089969635, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9084821343421936, "rewards/tag_count_reward/std": 0.22805356979370117, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1093.341552734375, "completions/mean_terminated_length": 846.6320190429688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2646635767939907, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11364775455527655, "kl": 0.01904296875, "learning_rate": 9.27987158414955e-07, "loss": 0.0435, "num_tokens": 744795806.0, "reward": 1.4045759439468384, "reward_std": 0.27130869030952454, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.2348809540271759, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1062.997802734375, "completions/mean_terminated_length": 855.3486938476562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2648766714612967, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.128938650186391, "kl": 0.0169677734375, "learning_rate": 9.278053995870235e-07, "loss": 0.0871, "num_tokens": 745346781.0, "reward": 1.3934152126312256, "reward_std": 0.3504249155521393, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19479762017726898, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1062.3929443359375, "completions/mean_terminated_length": 800.677978515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26508976612860263, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1273156715140467, "kl": 0.017120361328125, "learning_rate": 9.276234316727217e-07, "loss": 0.0741, "num_tokens": 745889469.0, "reward": 1.3649554252624512, "reward_std": 0.3606833815574646, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.22482003271579742, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1043.7366943359375, "completions/mean_terminated_length": 845.1925048828125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2653028607959086, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13099118791818068, "kl": 0.01849365234375, "learning_rate": 9.274412547727552e-07, "loss": 0.0856, "num_tokens": 746437239.0, "reward": 1.5072544813156128, "reward_std": 0.32017356157302856, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21763859689235687, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1022.4732666015625, "completions/mean_terminated_length": 782.3361206054688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.26551595546321455, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.41915369366731653, "kl": 0.021942138671875, "learning_rate": 9.272588689879447e-07, "loss": 0.0766, "num_tokens": 746972219.0, "reward": 1.364397406578064, "reward_std": 0.27661991119384766, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16235284507274628, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1033.7410888671875, "completions/mean_terminated_length": 836.2986450195312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2657290501305205, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12305629804651923, "kl": 0.019500732421875, "learning_rate": 9.270762744192271e-07, "loss": 0.0749, "num_tokens": 747497351.0, "reward": 1.5256696939468384, "reward_std": 0.2930222451686859, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4846842288970947, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.20802249014377594, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1079.3304443359375, "completions/mean_terminated_length": 893.8403930664062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.2659421447978264, "frac_reward_zero_std": 0.0, "grad_norm": 0.12347060711186815, "kl": 0.019287109375, "learning_rate": 9.268934711676543e-07, "loss": 0.0716, "num_tokens": 748046443.0, "reward": 1.3627232313156128, "reward_std": 0.3684873878955841, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.2357490509748459, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1047.8348388671875, "completions/mean_terminated_length": 865.7467651367188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.26615523946513236, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1294320809294041, "kl": 0.0160675048828125, "learning_rate": 9.267104593343938e-07, "loss": 0.1051, "num_tokens": 748585777.0, "reward": 1.3264509439468384, "reward_std": 0.3352195918560028, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.18351179361343384, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1025.5982666015625, "completions/mean_terminated_length": 867.4948120117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2663683341324383, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.34941743444332113, "kl": 0.020477294921875, "learning_rate": 9.265272390207289e-07, "loss": 0.091, "num_tokens": 749121021.0, "reward": 1.4090402126312256, "reward_std": 0.304977148771286, "rewards/accuracy_reward/mean": 0.4930555522441864, "rewards/accuracy_reward/std": 0.5005313754081726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19407851994037628, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1050.649658203125, "completions/mean_terminated_length": 878.3324584960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2665814287997443, "frac_reward_zero_std": 0.0, "grad_norm": 0.1414529674591582, "kl": 0.019805908203125, "learning_rate": 9.263438103280579e-07, "loss": 0.1249, "num_tokens": 749661584.0, "reward": 1.4810268878936768, "reward_std": 0.37198108434677124, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.19511540234088898, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 994.32373046875, "completions/mean_terminated_length": 815.5013427734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26679452346705024, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13613454586536913, "kl": 0.020843505859375, "learning_rate": 9.261601733578945e-07, "loss": 0.1247, "num_tokens": 750170801.0, "reward": 1.4810268878936768, "reward_std": 0.2909072935581207, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.1993686854839325, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 956.24560546875, "completions/mean_terminated_length": 754.0687866210938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2670076181343562, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14141598231871455, "kl": 0.019561767578125, "learning_rate": 9.259763282118678e-07, "loss": 0.1098, "num_tokens": 750673199.0, "reward": 1.4681919813156128, "reward_std": 0.34307199716567993, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.2075621485710144, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1048.075927734375, "completions/mean_terminated_length": 866.0316772460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26722071280166215, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13368042300137775, "kl": 0.0194091796875, "learning_rate": 9.25792274991722e-07, "loss": 0.1038, "num_tokens": 751205697.0, "reward": 1.575334906578064, "reward_std": 0.3807145059108734, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.20061947405338287, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 961.26123046875, "completions/mean_terminated_length": 753.1622314453125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2674338074689681, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1355603832274258, "kl": 0.0194091796875, "learning_rate": 9.256080137993164e-07, "loss": 0.0567, "num_tokens": 751703558.0, "reward": 1.5446429252624512, "reward_std": 0.29710811376571655, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.1737278550863266, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1038.8460693359375, "completions/mean_terminated_length": 792.1638793945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26764690213627407, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12704748606158775, "kl": 0.017120361328125, "learning_rate": 9.254235447366254e-07, "loss": 0.0891, "num_tokens": 752235729.0, "reward": 1.4330357313156128, "reward_std": 0.35093873739242554, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21488575637340546, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 976.8460083007812, "completions/mean_terminated_length": 854.276123046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26785999680357997, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13369863209014135, "kl": 0.021636962890625, "learning_rate": 9.252388679057388e-07, "loss": 0.0251, "num_tokens": 752739420.0, "reward": 1.4804688692092896, "reward_std": 0.27190059423446655, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.15670275688171387, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1100.94873046875, "completions/mean_terminated_length": 839.2279052734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2680730914708859, "frac_reward_zero_std": 0.0, "grad_norm": 0.1425194209591029, "kl": 0.018280029296875, "learning_rate": 9.250539834088608e-07, "loss": 0.1103, "num_tokens": 753301413.0, "reward": 1.3359376192092896, "reward_std": 0.2884243428707123, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.18353645503520966, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1012.58935546875, "completions/mean_terminated_length": 790.916015625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2682861861381919, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12886905502422835, "kl": 0.0189208984375, "learning_rate": 9.248688913483109e-07, "loss": 0.0602, "num_tokens": 753827917.0, "reward": 1.4877232313156128, "reward_std": 0.3074907958507538, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9095982313156128, "rewards/tag_count_reward/std": 0.23513396084308624, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1021.4085083007812, "completions/mean_terminated_length": 865.704345703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.26849928080549784, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11814261975111655, "kl": 0.0203857421875, "learning_rate": 9.246835918265235e-07, "loss": 0.0556, "num_tokens": 754353108.0, "reward": 1.4642857313156128, "reward_std": 0.28475141525268555, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15519075095653534, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1044.2054443359375, "completions/mean_terminated_length": 832.5946044921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2687123754728038, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14014850618227453, "kl": 0.020599365234375, "learning_rate": 9.244980849460475e-07, "loss": 0.1386, "num_tokens": 754891408.0, "reward": 1.4146206378936768, "reward_std": 0.3818550705909729, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.23655983805656433, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1078.78125, "completions/mean_terminated_length": 800.2701416015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26892547014010976, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14095879539279002, "kl": 0.016998291015625, "learning_rate": 9.243123708095469e-07, "loss": 0.0881, "num_tokens": 755447294.0, "reward": 1.3599331378936768, "reward_std": 0.28407299518585205, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2073153853416443, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1074.3638916015625, "completions/mean_terminated_length": 822.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2691385648074157, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13144982862168636, "kl": 0.01800537109375, "learning_rate": 9.241264495198003e-07, "loss": 0.0632, "num_tokens": 756001361.0, "reward": 1.2723214626312256, "reward_std": 0.29284587502479553, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.22655968368053436, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1111.5826416015625, "completions/mean_terminated_length": 879.4345092773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26935165947472167, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12932675287384024, "kl": 0.019195556640625, "learning_rate": 9.239403211797007e-07, "loss": 0.0883, "num_tokens": 756566182.0, "reward": 1.3911831378936768, "reward_std": 0.4241245985031128, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.22101959586143494, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1078.65185546875, "completions/mean_terminated_length": 871.1219482421875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2695647541420276, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13351338370829988, "kl": 0.01739501953125, "learning_rate": 9.23753985892256e-07, "loss": 0.1377, "num_tokens": 757119658.0, "reward": 1.4068081378936768, "reward_std": 0.3259485363960266, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21524494886398315, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 969.27685546875, "completions/mean_terminated_length": 782.9005126953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.26977784880933353, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1445523923289593, "kl": 0.019287109375, "learning_rate": 9.235674437605887e-07, "loss": 0.1158, "num_tokens": 757624134.0, "reward": 1.5546876192092896, "reward_std": 0.3644367754459381, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18195155262947083, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1012.3772583007812, "completions/mean_terminated_length": 830.2598266601562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2699909434766395, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11426376389846563, "kl": 0.019683837890625, "learning_rate": 9.233806948879354e-07, "loss": 0.0975, "num_tokens": 758142479.0, "reward": 1.4704241752624512, "reward_std": 0.3003813326358795, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19155997037887573, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1028.984375, "completions/mean_terminated_length": 859.1484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.27020403814394545, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12201104481683449, "kl": 0.019073486328125, "learning_rate": 9.231937393776474e-07, "loss": 0.1039, "num_tokens": 758668104.0, "reward": 1.5295759439468384, "reward_std": 0.36585596203804016, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17343208193778992, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1036.5826416015625, "completions/mean_terminated_length": 846.1034545898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2704171328112514, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.26527561567332636, "kl": 0.023193359375, "learning_rate": 9.2300657733319e-07, "loss": 0.0225, "num_tokens": 759193869.0, "reward": 1.4481027126312256, "reward_std": 0.3217213749885559, "rewards/accuracy_reward/mean": 0.5393518805503845, "rewards/accuracy_reward/std": 0.49902695417404175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.1949641853570938, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1002.1183471679688, "completions/mean_terminated_length": 824.6188354492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27063022747855736, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12381739915452857, "kl": 0.01800537109375, "learning_rate": 9.228192088581434e-07, "loss": 0.0545, "num_tokens": 759722882.0, "reward": 1.4877232313156128, "reward_std": 0.359967440366745, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1804911494255066, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1030.12060546875, "completions/mean_terminated_length": 784.8143920898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2708433221458633, "frac_reward_zero_std": 0.0, "grad_norm": 1.3405075545297052, "kl": 0.07879638671875, "learning_rate": 9.226316340562015e-07, "loss": 0.0541, "num_tokens": 760257608.0, "reward": 1.5474331378936768, "reward_std": 0.37388885021209717, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.19193758070468903, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 997.607177734375, "completions/mean_terminated_length": 825.7246704101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2710564168131693, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11600665764022257, "kl": 0.018524169921875, "learning_rate": 9.224438530311727e-07, "loss": 0.0999, "num_tokens": 760775608.0, "reward": 1.5044643878936768, "reward_std": 0.30424150824546814, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.1718057543039322, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1039.1273193359375, "completions/mean_terminated_length": 809.7123413085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2712695114804752, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1343517579353277, "kl": 0.022735595703125, "learning_rate": 9.222558658869794e-07, "loss": 0.0575, "num_tokens": 761313473.0, "reward": 1.4341518878936768, "reward_std": 0.33831170201301575, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.20031820237636566, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1163.9420166015625, "completions/mean_terminated_length": 896.6685791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.27148260614778114, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10241693365593539, "kl": 0.018218994140625, "learning_rate": 9.22067672727658e-07, "loss": 0.0514, "num_tokens": 761903671.0, "reward": 1.4335938692092896, "reward_std": 0.34236106276512146, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.1882467120885849, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1054.919677734375, "completions/mean_terminated_length": 892.4155883789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2716957008150871, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12789871144584183, "kl": 0.018096923828125, "learning_rate": 9.218792736573592e-07, "loss": 0.0549, "num_tokens": 762454003.0, "reward": 1.5496652126312256, "reward_std": 0.3572940528392792, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.18043838441371918, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1038.259033203125, "completions/mean_terminated_length": 838.4705810546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27190879548239305, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1346422106752444, "kl": 0.02032470703125, "learning_rate": 9.216906687803475e-07, "loss": 0.1228, "num_tokens": 762990359.0, "reward": 1.3476563692092896, "reward_std": 0.3108026087284088, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17351123690605164, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1064.58935546875, "completions/mean_terminated_length": 803.4576416015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.272121890149699, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13124513376721517, "kl": 0.018585205078125, "learning_rate": 9.215018582010008e-07, "loss": 0.0352, "num_tokens": 763541215.0, "reward": 1.3303571939468384, "reward_std": 0.3550775647163391, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19285328686237335, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1048.43310546875, "completions/mean_terminated_length": 824.486328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27233498481700497, "frac_reward_zero_std": 0.0, "grad_norm": 0.1229452916585429, "kl": 0.01983642578125, "learning_rate": 9.213128420238119e-07, "loss": 0.0637, "num_tokens": 764082129.0, "reward": 1.5044643878936768, "reward_std": 0.31275925040245056, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19418221712112427, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1047.3817138671875, "completions/mean_terminated_length": 833.1572265625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2725480794843109, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15425825221339634, "kl": 0.019927978515625, "learning_rate": 9.211236203533864e-07, "loss": 0.0787, "num_tokens": 764619340.0, "reward": 1.446428656578064, "reward_std": 0.3318825960159302, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19710472226142883, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1034.4241943359375, "completions/mean_terminated_length": 843.5384521484375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2727611741516169, "frac_reward_zero_std": 0.0, "grad_norm": 0.12728788961953486, "kl": 0.0191650390625, "learning_rate": 9.209341932944441e-07, "loss": 0.1464, "num_tokens": 765151098.0, "reward": 1.4441964626312256, "reward_std": 0.2830570936203003, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.208564892411232, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1081.7523193359375, "completions/mean_terminated_length": 881.210205078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2729742688189228, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12452789166596236, "kl": 0.02044677734375, "learning_rate": 9.207445609518185e-07, "loss": 0.1034, "num_tokens": 765710651.0, "reward": 1.4693081378936768, "reward_std": 0.3502883017063141, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.18375654518604279, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1040.3616943359375, "completions/mean_terminated_length": 840.9893188476562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.27318736348622874, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13767967117707908, "kl": 0.01983642578125, "learning_rate": 9.205547234304563e-07, "loss": 0.1237, "num_tokens": 766247853.0, "reward": 1.4977679252624512, "reward_std": 0.4095991253852844, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9129464030265808, "rewards/tag_count_reward/std": 0.23101231455802917, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1080.1116943359375, "completions/mean_terminated_length": 819.6317138671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2734004581535347, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1527045496459597, "kl": 0.01776123046875, "learning_rate": 9.203646808354185e-07, "loss": 0.0951, "num_tokens": 766814543.0, "reward": 1.4827009439468384, "reward_std": 0.37107545137405396, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.23668117821216583, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1110.21435546875, "completions/mean_terminated_length": 915.5794677734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27361355282084066, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12547394198569417, "kl": 0.021728515625, "learning_rate": 9.201744332718787e-07, "loss": 0.0363, "num_tokens": 767378655.0, "reward": 1.4732143878936768, "reward_std": 0.32989761233329773, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.22120662033557892, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1116.5491943359375, "completions/mean_terminated_length": 875.8370971679688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2738266474881466, "frac_reward_zero_std": 0.0, "grad_norm": 0.1322886581869743, "kl": 0.017669677734375, "learning_rate": 9.199839808451244e-07, "loss": 0.0818, "num_tokens": 767948501.0, "reward": 1.4029018878936768, "reward_std": 0.3874658942222595, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.21812346577644348, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1135.3773193359375, "completions/mean_terminated_length": 866.338134765625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.27403974215545257, "frac_reward_zero_std": 0.0, "grad_norm": 0.13280638447083, "kl": 0.0177001953125, "learning_rate": 9.197933236605568e-07, "loss": 0.0949, "num_tokens": 768526990.0, "reward": 1.4185268878936768, "reward_std": 0.3754233419895172, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24003124237060547, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1002.4397583007812, "completions/mean_terminated_length": 802.2260131835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27425283682275853, "frac_reward_zero_std": 0.0, "grad_norm": 0.14771938410677754, "kl": 0.019134521484375, "learning_rate": 9.196024618236898e-07, "loss": 0.1237, "num_tokens": 769038755.0, "reward": 1.5161831378936768, "reward_std": 0.3286365568637848, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20777854323387146, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1004.9866333007812, "completions/mean_terminated_length": 821.569580078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2744659314900645, "frac_reward_zero_std": 0.0, "grad_norm": 0.126777309169689, "kl": 0.019775390625, "learning_rate": 9.194113954401507e-07, "loss": 0.0457, "num_tokens": 769556573.0, "reward": 1.4771206378936768, "reward_std": 0.36081451177597046, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22157806158065796, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1101.868408203125, "completions/mean_terminated_length": 886.7205810546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2746790261573704, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13175449661959177, "kl": 0.019561767578125, "learning_rate": 9.192201246156804e-07, "loss": 0.0822, "num_tokens": 770113426.0, "reward": 1.3945313692092896, "reward_std": 0.3666723966598511, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9012276530265808, "rewards/tag_count_reward/std": 0.236807718873024, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1077.90625, "completions/mean_terminated_length": 857.3096313476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27489212082467634, "frac_reward_zero_std": 0.0, "grad_norm": 0.13093331823566642, "kl": 0.02020263671875, "learning_rate": 9.190286494561324e-07, "loss": 0.065, "num_tokens": 770665448.0, "reward": 1.4564732313156128, "reward_std": 0.3912244737148285, "rewards/accuracy_reward/mean": 0.5694444179534912, "rewards/accuracy_reward/std": 0.495728075504303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.23004567623138428, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1046.16748046875, "completions/mean_terminated_length": 854.3270874023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2751052154919823, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13619036697238843, "kl": 0.02032470703125, "learning_rate": 9.188369700674735e-07, "loss": 0.0764, "num_tokens": 771199907.0, "reward": 1.5212054252624512, "reward_std": 0.35409578680992126, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2190144956111908, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1152.046875, "completions/mean_terminated_length": 881.1773071289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27531831015928826, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1342892072882594, "kl": 0.017669677734375, "learning_rate": 9.18645086555784e-07, "loss": 0.0817, "num_tokens": 771784248.0, "reward": 1.4023438692092896, "reward_std": 0.34993189573287964, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.22639364004135132, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1126.3192138671875, "completions/mean_terminated_length": 913.6236572265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2755314048265942, "frac_reward_zero_std": 0.0, "grad_norm": 0.12335500836179189, "kl": 0.017242431640625, "learning_rate": 9.184529990272564e-07, "loss": 0.0463, "num_tokens": 772358951.0, "reward": 1.4380581378936768, "reward_std": 0.3645114302635193, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19688211381435394, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1151.0201416015625, "completions/mean_terminated_length": 848.4567260742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2757444994939002, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1339283393085002, "kl": 0.018035888671875, "learning_rate": 9.182607075881966e-07, "loss": 0.1464, "num_tokens": 772947328.0, "reward": 1.3710938692092896, "reward_std": 0.3301179111003876, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8978794813156128, "rewards/tag_count_reward/std": 0.2377442717552185, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1066.6160888671875, "completions/mean_terminated_length": 850.016357421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27595759416120613, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11260364875495107, "kl": 0.018035888671875, "learning_rate": 9.180682123450229e-07, "loss": 0.0765, "num_tokens": 773499780.0, "reward": 1.3777902126312256, "reward_std": 0.3194967806339264, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.20309332013130188, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1103.009033203125, "completions/mean_terminated_length": 852.0791015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2761706888285121, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12013454198141268, "kl": 0.016998291015625, "learning_rate": 9.178755134042671e-07, "loss": 0.0907, "num_tokens": 774066232.0, "reward": 1.4285714626312256, "reward_std": 0.35301855206489563, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.20529504120349884, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1060.26123046875, "completions/mean_terminated_length": 838.9644775390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.276383783495818, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11987522861710993, "kl": 0.017822265625, "learning_rate": 9.176826108725728e-07, "loss": 0.0608, "num_tokens": 774608781.0, "reward": 1.4531251192092896, "reward_std": 0.32021352648735046, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.21881206333637238, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1005.0022583007812, "completions/mean_terminated_length": 834.329833984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.27659687816312395, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14466275889324853, "kl": 0.020233154296875, "learning_rate": 9.174895048566973e-07, "loss": 0.1145, "num_tokens": 775126510.0, "reward": 1.5039063692092896, "reward_std": 0.2775116264820099, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18025840818881989, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 893.5067138671875, "completions/mean_terminated_length": 748.4698486328125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2768099728304299, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1311021620055237, "kl": 0.0228271484375, "learning_rate": 9.172961954635099e-07, "loss": 0.0903, "num_tokens": 775593969.0, "reward": 1.4994419813156128, "reward_std": 0.3223113417625427, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.19004856050014496, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1022.2969360351562, "completions/mean_terminated_length": 816.0563354492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.27702306749773586, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13095663745198166, "kl": 0.017852783203125, "learning_rate": 9.171026827999922e-07, "loss": 0.0512, "num_tokens": 776119414.0, "reward": 1.4274554252624512, "reward_std": 0.28089243173599243, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.15616509318351746, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1020.169677734375, "completions/mean_terminated_length": 728.6074829101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2772361621650418, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10903291861117369, "kl": 0.019256591796875, "learning_rate": 9.16908966973239e-07, "loss": 0.0765, "num_tokens": 776649762.0, "reward": 1.4313616752624512, "reward_std": 0.3040112853050232, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.1997338831424713, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1112.2879638671875, "completions/mean_terminated_length": 825.8455200195312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2774492568323478, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12266434006344622, "kl": 0.0175018310546875, "learning_rate": 9.167150480904571e-07, "loss": 0.1108, "num_tokens": 777220851.0, "reward": 1.4754464626312256, "reward_std": 0.3249458372592926, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18152911961078644, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1051.6875, "completions/mean_terminated_length": 838.3848266601562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.27766235149965374, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12180991859009188, "kl": 0.016845703125, "learning_rate": 9.165209262589656e-07, "loss": 0.0644, "num_tokens": 777767719.0, "reward": 1.3811384439468384, "reward_std": 0.3671019971370697, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.1848943680524826, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 975.72998046875, "completions/mean_terminated_length": 790.4685668945312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2778754461669597, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13799097346756864, "kl": 0.02154541015625, "learning_rate": 9.163266015861963e-07, "loss": 0.061, "num_tokens": 778272014.0, "reward": 1.5267857313156128, "reward_std": 0.32575759291648865, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19273674488067627, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1047.388427734375, "completions/mean_terminated_length": 829.8641357421875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2780885408342656, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12117315708354938, "kl": 0.01800537109375, "learning_rate": 9.16132074179693e-07, "loss": 0.0829, "num_tokens": 778813820.0, "reward": 1.583147406578064, "reward_std": 0.34535449743270874, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.19325986504554749, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 997.2076416015625, "completions/mean_terminated_length": 789.2968139648438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.27830163550157155, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11696184323983488, "kl": 0.0206298828125, "learning_rate": 9.159373441471116e-07, "loss": 0.1028, "num_tokens": 779331481.0, "reward": 1.5647321939468384, "reward_std": 0.3234870433807373, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17605489492416382, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1053.6273193359375, "completions/mean_terminated_length": 834.1607666015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2785147301688775, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11333998857801916, "kl": 0.0174713134765625, "learning_rate": 9.157424115962202e-07, "loss": 0.0456, "num_tokens": 779870594.0, "reward": 1.4994419813156128, "reward_std": 0.29483819007873535, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.20140819251537323, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1079.087158203125, "completions/mean_terminated_length": 814.8380737304688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.27872782483618347, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1164108918532394, "kl": 0.017669677734375, "learning_rate": 9.155472766348993e-07, "loss": 0.0557, "num_tokens": 780423801.0, "reward": 1.4369419813156128, "reward_std": 0.35016340017318726, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.18019606173038483, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1081.5535888671875, "completions/mean_terminated_length": 877.8162231445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2789409195034894, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11959882336942596, "kl": 0.0157318115234375, "learning_rate": 9.15351939371141e-07, "loss": 0.1083, "num_tokens": 780980753.0, "reward": 1.5412946939468384, "reward_std": 0.35582858324050903, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18730680644512177, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1056.868408203125, "completions/mean_terminated_length": 844.6748046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2791540141707954, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.4947473902679029, "kl": 0.05352783203125, "learning_rate": 9.151563999130496e-07, "loss": 0.0708, "num_tokens": 781523846.0, "reward": 1.5541294813156128, "reward_std": 0.340303510427475, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19121423363685608, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1092.430908203125, "completions/mean_terminated_length": 852.203857421875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.27936710883810134, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12817383387377276, "kl": 0.0189208984375, "learning_rate": 9.149606583688413e-07, "loss": 0.1018, "num_tokens": 782076535.0, "reward": 1.4564732313156128, "reward_std": 0.36992743611335754, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.23239968717098236, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1121.919677734375, "completions/mean_terminated_length": 882.5955200195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.2795802035054073, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11178071436718626, "kl": 0.0167388916015625, "learning_rate": 9.147647148468437e-07, "loss": 0.0398, "num_tokens": 782646675.0, "reward": 1.317522406578064, "reward_std": 0.2902761995792389, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16430194675922394, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 933.4777221679688, "completions/mean_terminated_length": 764.43701171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2797932981727132, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1339476684640531, "kl": 0.021026611328125, "learning_rate": 9.14568569455497e-07, "loss": 0.0504, "num_tokens": 783136857.0, "reward": 1.4927456378936768, "reward_std": 0.300567626953125, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.13225844502449036, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1056.1607666015625, "completions/mean_terminated_length": 869.3687133789062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.28000639284001916, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1283274046980892, "kl": 0.019195556640625, "learning_rate": 9.143722223033523e-07, "loss": 0.0678, "num_tokens": 783676529.0, "reward": 1.469866156578064, "reward_std": 0.3264045715332031, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.19393454492092133, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1124.7254638671875, "completions/mean_terminated_length": 882.853515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2802194875073251, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11825637178520826, "kl": 0.0162811279296875, "learning_rate": 9.14175673499073e-07, "loss": 0.0313, "num_tokens": 784250262.0, "reward": 1.3911831378936768, "reward_std": 0.2963281571865082, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.201184943318367, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 943.0245971679688, "completions/mean_terminated_length": 765.5414428710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2804325821746311, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12996510470112538, "kl": 0.02069091796875, "learning_rate": 9.139789231514335e-07, "loss": 0.0444, "num_tokens": 784740193.0, "reward": 1.4626116752624512, "reward_std": 0.3077471852302551, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.18222151696681976, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 948.5938110351562, "completions/mean_terminated_length": 778.5824584960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28064567684193703, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11537986731300234, "kl": 0.019744873046875, "learning_rate": 9.137819713693204e-07, "loss": 0.0437, "num_tokens": 785232331.0, "reward": 1.5669643878936768, "reward_std": 0.3538995683193207, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1838252991437912, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1119.7076416015625, "completions/mean_terminated_length": 869.8838500976562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.280858771509243, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14324813712547055, "kl": 0.0158843994140625, "learning_rate": 9.135848182617314e-07, "loss": 0.0829, "num_tokens": 785807720.0, "reward": 1.3844866752624512, "reward_std": 0.30960574746131897, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23282569646835327, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1061.5201416015625, "completions/mean_terminated_length": 866.334228515625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.28107186617654895, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11645060112077799, "kl": 0.017913818359375, "learning_rate": 9.133874639377753e-07, "loss": 0.0662, "num_tokens": 786358817.0, "reward": 1.473772406578064, "reward_std": 0.2898625433444977, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.1829940527677536, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1057.212158203125, "completions/mean_terminated_length": 811.5849609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2812849608438549, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12767703266349847, "kl": 0.01812744140625, "learning_rate": 9.13189908506673e-07, "loss": 0.0503, "num_tokens": 786899984.0, "reward": 1.3063616752624512, "reward_std": 0.33003297448158264, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19617067277431488, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1070.49560546875, "completions/mean_terminated_length": 793.209228515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28149805551116086, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13722848041616018, "kl": 0.0153656005859375, "learning_rate": 9.12992152077756e-07, "loss": 0.0861, "num_tokens": 787447246.0, "reward": 1.5094866752624512, "reward_std": 0.3423069715499878, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21863441169261932, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1105.0491943359375, "completions/mean_terminated_length": 903.1707763671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28171115017846676, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11767120535812368, "kl": 0.01885986328125, "learning_rate": 9.127941947604676e-07, "loss": 0.0646, "num_tokens": 788009348.0, "reward": 1.5563616752624512, "reward_std": 0.3679960370063782, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18297357857227325, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1181.185302734375, "completions/mean_terminated_length": 905.8441162109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2819242448457727, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11667056771683709, "kl": 0.018402099609375, "learning_rate": 9.12596036664362e-07, "loss": 0.1007, "num_tokens": 788600551.0, "reward": 1.3309152126312256, "reward_std": 0.3525812327861786, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.2226571887731552, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1099.384033203125, "completions/mean_terminated_length": 886.8524169921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2821373395130787, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11243789580256533, "kl": 0.0184326171875, "learning_rate": 9.123976778991045e-07, "loss": 0.0553, "num_tokens": 789166339.0, "reward": 1.4453126192092896, "reward_std": 0.4101276695728302, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.2030172199010849, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1133.732177734375, "completions/mean_terminated_length": 821.6766967773438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.28235043418038464, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13040762072720444, "kl": 0.0167694091796875, "learning_rate": 9.121991185744713e-07, "loss": 0.1286, "num_tokens": 789744491.0, "reward": 1.3755581378936768, "reward_std": 0.34969362616539, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.22809666395187378, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1135.482177734375, "completions/mean_terminated_length": 886.6136474609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2825635288476906, "frac_reward_zero_std": 0.0, "grad_norm": 0.12475266511159067, "kl": 0.017303466796875, "learning_rate": 9.120003588003499e-07, "loss": 0.0992, "num_tokens": 790326051.0, "reward": 1.4079241752624512, "reward_std": 0.418332040309906, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.22078222036361694, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1067.5357666015625, "completions/mean_terminated_length": 837.950439453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28277662351499655, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13121360041275573, "kl": 0.01800537109375, "learning_rate": 9.118013986867389e-07, "loss": 0.0454, "num_tokens": 790880259.0, "reward": 1.4335938692092896, "reward_std": 0.39410674571990967, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21280090510845184, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1007.6964721679688, "completions/mean_terminated_length": 837.4649047851562, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2829897181823025, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13209376764465297, "kl": 0.01959228515625, "learning_rate": 9.116022383437472e-07, "loss": 0.0951, "num_tokens": 791399947.0, "reward": 1.5345982313156128, "reward_std": 0.36471685767173767, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.19029554724693298, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1021.5982666015625, "completions/mean_terminated_length": 831.5238037109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.28320281284960847, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12199474478311033, "kl": 0.01953125, "learning_rate": 9.114028778815947e-07, "loss": 0.0781, "num_tokens": 791924935.0, "reward": 1.5295759439468384, "reward_std": 0.30135899782180786, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17181210219860077, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1115.4888916015625, "completions/mean_terminated_length": 900.2940063476562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.28341590751691437, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1286730240094993, "kl": 0.01953125, "learning_rate": 9.112033174106124e-07, "loss": 0.1351, "num_tokens": 792492258.0, "reward": 1.4414063692092896, "reward_std": 0.41261225938796997, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.22844666242599487, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 933.3214721679688, "completions/mean_terminated_length": 767.5487670898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2836290021842203, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14463917669042317, "kl": 0.020843505859375, "learning_rate": 9.110035570412417e-07, "loss": 0.0711, "num_tokens": 792985346.0, "reward": 1.4760044813156128, "reward_std": 0.3053831160068512, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.15989671647548676, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1035.591552734375, "completions/mean_terminated_length": 812.1444091796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2838420968515263, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13670847931225974, "kl": 0.02178955078125, "learning_rate": 9.108035968840348e-07, "loss": 0.1155, "num_tokens": 793516571.0, "reward": 1.4888393878936768, "reward_std": 0.3700525462627411, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.22866584360599518, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1162.2076416015625, "completions/mean_terminated_length": 891.0466918945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28405519151883224, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12082535966206552, "kl": 0.0195770263671875, "learning_rate": 9.10603437049654e-07, "loss": 0.1001, "num_tokens": 794106872.0, "reward": 1.3783482313156128, "reward_std": 0.3501928150653839, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9185267686843872, "rewards/tag_count_reward/std": 0.21232296526432037, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1039.462158203125, "completions/mean_terminated_length": 871.3724365234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2842682861861382, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1430705814576282, "kl": 0.019134521484375, "learning_rate": 9.104030776488727e-07, "loss": 0.1336, "num_tokens": 794636743.0, "reward": 1.5133929252624512, "reward_std": 0.34940335154533386, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.2073281854391098, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1213.9732666015625, "completions/mean_terminated_length": 932.644775390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.28448138085344415, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11293167541330806, "kl": 0.016754150390625, "learning_rate": 9.102025187925742e-07, "loss": 0.0494, "num_tokens": 795251147.0, "reward": 1.301897406578064, "reward_std": 0.348479300737381, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.22180895507335663, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1067.22998046875, "completions/mean_terminated_length": 824.0863647460938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2846944755207501, "frac_reward_zero_std": 0.0, "grad_norm": 0.12420104375672326, "kl": 0.021087646484375, "learning_rate": 9.10001760591753e-07, "loss": 0.0662, "num_tokens": 795799826.0, "reward": 1.5239956378936768, "reward_std": 0.3567754924297333, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.18406878411769867, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1122.009033203125, "completions/mean_terminated_length": 914.5464477539062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.28490757018805607, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11326181532397314, "kl": 0.01788330078125, "learning_rate": 9.098008031575131e-07, "loss": 0.0757, "num_tokens": 796370134.0, "reward": 1.4095982313156128, "reward_std": 0.3356040120124817, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.20478054881095886, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1005.8660888671875, "completions/mean_terminated_length": 765.3736572265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28512066485536197, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13537729933950773, "kl": 0.021392822265625, "learning_rate": 9.095996466010689e-07, "loss": 0.0988, "num_tokens": 796885002.0, "reward": 1.5161831378936768, "reward_std": 0.3343140780925751, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.22180894017219543, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1077.399658203125, "completions/mean_terminated_length": 863.1798095703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28533375952266793, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1352687549697869, "kl": 0.0177001953125, "learning_rate": 9.093982910337454e-07, "loss": 0.0957, "num_tokens": 797434525.0, "reward": 1.4469866752624512, "reward_std": 0.3665366768836975, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.20797671377658844, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1164.977783203125, "completions/mean_terminated_length": 920.9515380859375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2855468541899739, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11662790671178405, "kl": 0.016326904296875, "learning_rate": 9.091967365669774e-07, "loss": 0.0321, "num_tokens": 798024867.0, "reward": 1.4274554252624512, "reward_std": 0.3158789873123169, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.17893508076667786, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1116.8773193359375, "completions/mean_terminated_length": 866.291748046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.28575994885727984, "frac_reward_zero_std": 0.0, "grad_norm": 0.12843343009270597, "kl": 0.0170440673828125, "learning_rate": 9.089949833123098e-07, "loss": 0.088, "num_tokens": 798601596.0, "reward": 1.4291294813156128, "reward_std": 0.4090938866138458, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.23006939888000488, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1070.0648193359375, "completions/mean_terminated_length": 854.2261352539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2859730435245858, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12141747425550446, "kl": 0.01885986328125, "learning_rate": 9.087930313813977e-07, "loss": 0.0601, "num_tokens": 799156825.0, "reward": 1.3119419813156128, "reward_std": 0.2898409068584442, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.19648860394954681, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 961.1473388671875, "completions/mean_terminated_length": 789.8346557617188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.28618613819189176, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12548457007195213, "kl": 0.019561767578125, "learning_rate": 9.08590880886006e-07, "loss": 0.0561, "num_tokens": 799657051.0, "reward": 1.5351563692092896, "reward_std": 0.35372021794319153, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.20974001288414001, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1017.7098388671875, "completions/mean_terminated_length": 870.5255126953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2863992328591977, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1262389678285119, "kl": 0.0179443359375, "learning_rate": 9.083885319380095e-07, "loss": 0.059, "num_tokens": 800182633.0, "reward": 1.5106027126312256, "reward_std": 0.33969682455062866, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.19061264395713806, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1118.703125, "completions/mean_terminated_length": 897.9309692382812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2866123275265037, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12448776595543022, "kl": 0.017852783203125, "learning_rate": 9.08185984649393e-07, "loss": 0.115, "num_tokens": 800755572.0, "reward": 1.4335938692092896, "reward_std": 0.37306830286979675, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.2255041003227234, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1066.2366943359375, "completions/mean_terminated_length": 842.986328125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2868254221938096, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12429465381767626, "kl": 0.017669677734375, "learning_rate": 9.079832391322506e-07, "loss": 0.0796, "num_tokens": 801302222.0, "reward": 1.4051339626312256, "reward_std": 0.3277166485786438, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.2131679654121399, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 905.7835083007812, "completions/mean_terminated_length": 745.9312744140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.28703851686111553, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1236832228664559, "kl": 0.019989013671875, "learning_rate": 9.077802954987868e-07, "loss": 0.005, "num_tokens": 801775261.0, "reward": 1.5407366752624512, "reward_std": 0.2504132390022278, "rewards/accuracy_reward/mean": 0.6466346383094788, "rewards/accuracy_reward/std": 0.4785905182361603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.17596179246902466, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1054.0045166015625, "completions/mean_terminated_length": 873.0396118164062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2872516115284215, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12909507788107594, "kl": 0.019622802734375, "learning_rate": 9.07577153861315e-07, "loss": 0.1357, "num_tokens": 802318431.0, "reward": 1.5161831378936768, "reward_std": 0.3458826243877411, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8889508843421936, "rewards/tag_count_reward/std": 0.25267162919044495, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 916.2254638671875, "completions/mean_terminated_length": 777.235595703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28746470619572745, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14159079616731926, "kl": 0.020599365234375, "learning_rate": 9.073738143322589e-07, "loss": 0.0638, "num_tokens": 802790516.0, "reward": 1.5887277126312256, "reward_std": 0.3157961368560791, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18257060647010803, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 943.5245971679688, "completions/mean_terminated_length": 807.88720703125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2876778008630334, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14104431899326125, "kl": 0.022491455078125, "learning_rate": 9.071702770241512e-07, "loss": 0.0503, "num_tokens": 803284319.0, "reward": 1.5664063692092896, "reward_std": 0.2470361292362213, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.17317995429039001, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1070.6920166015625, "completions/mean_terminated_length": 854.9918212890625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.28789089553033936, "frac_reward_zero_std": 0.0, "grad_norm": 0.13545160650419127, "kl": 0.018280029296875, "learning_rate": 9.069665420496341e-07, "loss": 0.0867, "num_tokens": 803831637.0, "reward": 1.4587054252624512, "reward_std": 0.3756580948829651, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.20986275374889374, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 954.6629638671875, "completions/mean_terminated_length": 772.4401245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2881039901976453, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13007115145869658, "kl": 0.0203857421875, "learning_rate": 9.067626095214596e-07, "loss": 0.0847, "num_tokens": 804320430.0, "reward": 1.5535714626312256, "reward_std": 0.334953248500824, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18918029963970184, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1087.0848388671875, "completions/mean_terminated_length": 858.8011474609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2883170848649513, "frac_reward_zero_std": 0.0, "grad_norm": 0.12294409359863658, "kl": 0.0164794921875, "learning_rate": 9.065584795524884e-07, "loss": 0.1207, "num_tokens": 804877348.0, "reward": 1.4704241752624512, "reward_std": 0.37603843212127686, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.1959095597267151, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1085.5223388671875, "completions/mean_terminated_length": 888.8870849609375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2885301795322572, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12061608385179286, "kl": 0.0160369873046875, "learning_rate": 9.06354152255691e-07, "loss": 0.0561, "num_tokens": 805430622.0, "reward": 1.3744419813156128, "reward_std": 0.3371695280075073, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960493743419647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1922689974308014, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 995.87060546875, "completions/mean_terminated_length": 823.703857421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28874327419956314, "frac_reward_zero_std": 0.0, "grad_norm": 0.13305217535225422, "kl": 0.023223876953125, "learning_rate": 9.06149627744147e-07, "loss": 0.0669, "num_tokens": 805941076.0, "reward": 1.5892857313156128, "reward_std": 0.34248897433280945, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.16685131192207336, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1125.46875, "completions/mean_terminated_length": 887.0618286132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2889563688668691, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13760718076730524, "kl": 0.01983642578125, "learning_rate": 9.059449061310451e-07, "loss": 0.0732, "num_tokens": 806506358.0, "reward": 1.25, "reward_std": 0.31314703822135925, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839529275894165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.20789341628551483, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1028.03125, "completions/mean_terminated_length": 809.6640014648438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28916946353417505, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11335865602688269, "kl": 0.0175323486328125, "learning_rate": 9.057399875296827e-07, "loss": 0.037, "num_tokens": 807038260.0, "reward": 1.3035714626312256, "reward_std": 0.3286227583885193, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.19061346352100372, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1048.4888916015625, "completions/mean_terminated_length": 872.7218017578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.289382558201481, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12460983050952815, "kl": 0.0170440673828125, "learning_rate": 9.055348720534668e-07, "loss": 0.0322, "num_tokens": 807577823.0, "reward": 1.3984376192092896, "reward_std": 0.31667280197143555, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16746000945568085, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1135.9442138671875, "completions/mean_terminated_length": 839.121337890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.28959565286878697, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11633832651189803, "kl": 0.01690673828125, "learning_rate": 9.053295598159133e-07, "loss": 0.0759, "num_tokens": 808158118.0, "reward": 1.3203125, "reward_std": 0.3191813826560974, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.2081664800643921, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1106.80810546875, "completions/mean_terminated_length": 846.70654296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2898087475360929, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12601665168675036, "kl": 0.0165863037109375, "learning_rate": 9.051240509306463e-07, "loss": 0.0793, "num_tokens": 808725488.0, "reward": 1.4017857313156128, "reward_std": 0.3301330804824829, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1010.7053833007812, "completions/mean_terminated_length": 778.3059692382812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2900218422033989, "frac_reward_zero_std": 0.25, "grad_norm": 0.11951829546230916, "kl": 0.017578125, "learning_rate": 9.049183455113998e-07, "loss": 0.0637, "num_tokens": 809252044.0, "reward": 1.32421875, "reward_std": 0.26720666885375977, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.1865345686674118, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 959.8281860351562, "completions/mean_terminated_length": 816.9368896484375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2902349368707048, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1410473298302068, "kl": 0.0213623046875, "learning_rate": 9.047124436720155e-07, "loss": 0.0925, "num_tokens": 809752079.0, "reward": 1.5686384439468384, "reward_std": 0.29772481322288513, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.162506565451622, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1069.3817138671875, "completions/mean_terminated_length": 846.8466186523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29044803153801074, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12894771585055406, "kl": 0.019378662109375, "learning_rate": 9.045063455264447e-07, "loss": 0.092, "num_tokens": 810298330.0, "reward": 1.4051339626312256, "reward_std": 0.3721138536930084, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.21542829275131226, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1070.0648193359375, "completions/mean_terminated_length": 827.6239624023438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2906611262053167, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11979578056277823, "kl": 0.018280029296875, "learning_rate": 9.043000511887467e-07, "loss": 0.0496, "num_tokens": 810846599.0, "reward": 1.4352679252624512, "reward_std": 0.3696146309375763, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.21985933184623718, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1000.7835083007812, "completions/mean_terminated_length": 780.0189208984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.29087422087262266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16189542852984404, "kl": 0.018585205078125, "learning_rate": 9.040935607730899e-07, "loss": 0.0766, "num_tokens": 811366694.0, "reward": 1.4553571939468384, "reward_std": 0.31917306780815125, "rewards/accuracy_reward/mean": 0.5162037014961243, "rewards/accuracy_reward/std": 0.5003167986869812, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15736359357833862, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1137.109375, "completions/mean_terminated_length": 904.9215698242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2910873155399286, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10998254603439232, "kl": 0.0157012939453125, "learning_rate": 9.038868743937505e-07, "loss": 0.0649, "num_tokens": 811941687.0, "reward": 1.477678656578064, "reward_std": 0.32315024733543396, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.17503081262111664, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1096.493408203125, "completions/mean_terminated_length": 892.783203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2913004102072346, "frac_reward_zero_std": 0.0, "grad_norm": 0.12337981894192616, "kl": 0.018585205078125, "learning_rate": 9.036799921651141e-07, "loss": 0.1296, "num_tokens": 812502964.0, "reward": 1.3649554252624512, "reward_std": 0.36768028140068054, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.20240136981010437, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1118.01123046875, "completions/mean_terminated_length": 864.3778686523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29151350487454053, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12029299841815014, "kl": 0.017425537109375, "learning_rate": 9.034729142016739e-07, "loss": 0.1162, "num_tokens": 813074409.0, "reward": 1.5000001192092896, "reward_std": 0.4060979187488556, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18240727484226227, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 902.825927734375, "completions/mean_terminated_length": 742.559814453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2917265995418465, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13859788910943882, "kl": 0.02142333984375, "learning_rate": 9.032656406180317e-07, "loss": 0.0852, "num_tokens": 813536459.0, "reward": 1.6439732313156128, "reward_std": 0.34982573986053467, "rewards/accuracy_reward/mean": 0.7053571343421936, "rewards/accuracy_reward/std": 0.45639166235923767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.19608551263809204, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 887.6563110351562, "completions/mean_terminated_length": 738.5944213867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2919396942091524, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14003256928640812, "kl": 0.0211181640625, "learning_rate": 9.030581715288976e-07, "loss": 0.1178, "num_tokens": 813998193.0, "reward": 1.5027902126312256, "reward_std": 0.3440830409526825, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20504480600357056, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1045.12060546875, "completions/mean_terminated_length": 840.231201171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.29215278887645835, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12547713190171392, "kl": 0.017303466796875, "learning_rate": 9.028505070490898e-07, "loss": 0.0751, "num_tokens": 814530551.0, "reward": 1.505022406578064, "reward_std": 0.3037368059158325, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16576191782951355, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1049.3817138671875, "completions/mean_terminated_length": 848.587158203125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2923658835437643, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13948451405051082, "kl": 0.020416259765625, "learning_rate": 9.026426472935348e-07, "loss": 0.1093, "num_tokens": 815063842.0, "reward": 1.5334821939468384, "reward_std": 0.35994020104408264, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.2065439224243164, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1155.779052734375, "completions/mean_terminated_length": 875.8152465820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29257897821107026, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12391504834502774, "kl": 0.0147705078125, "learning_rate": 9.024345923772671e-07, "loss": 0.1135, "num_tokens": 815660911.0, "reward": 1.2840402126312256, "reward_std": 0.36466774344444275, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23758143186569214, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1208.359375, "completions/mean_terminated_length": 883.4210205078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2927920728783762, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11496203230591968, "kl": 0.0152435302734375, "learning_rate": 9.02226342415429e-07, "loss": 0.0834, "num_tokens": 816283776.0, "reward": 1.3515626192092896, "reward_std": 0.38633763790130615, "rewards/accuracy_reward/mean": 0.46759259700775146, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9006696343421936, "rewards/tag_count_reward/std": 0.24499371647834778, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 922.7210083007812, "completions/mean_terminated_length": 752.048828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2930051675456822, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13564816184001682, "kl": 0.021087646484375, "learning_rate": 9.020178975232709e-07, "loss": 0.1157, "num_tokens": 816761603.0, "reward": 1.4810268878936768, "reward_std": 0.3346594572067261, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.19318638741970062, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1024.165283203125, "completions/mean_terminated_length": 794.7813720703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29321826221298813, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16033672967602017, "kl": 0.02008056640625, "learning_rate": 9.018092578161514e-07, "loss": 0.1145, "num_tokens": 817291741.0, "reward": 1.4196429252624512, "reward_std": 0.3855610191822052, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.211996391415596, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1182.884033203125, "completions/mean_terminated_length": 911.4252319335938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2934313568802941, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11748575013782174, "kl": 0.0154876708984375, "learning_rate": 9.016004234095362e-07, "loss": 0.0866, "num_tokens": 817898969.0, "reward": 1.3543527126312256, "reward_std": 0.39249926805496216, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.23329170048236847, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1018.12060546875, "completions/mean_terminated_length": 827.402099609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2936444515476, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12969368088868707, "kl": 0.0202178955078125, "learning_rate": 9.013913944189994e-07, "loss": 0.1255, "num_tokens": 818425823.0, "reward": 1.5117188692092896, "reward_std": 0.31404152512550354, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.1926516890525818, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 942.6808471679688, "completions/mean_terminated_length": 744.8869018554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29385754621490595, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12554022741685236, "kl": 0.017333984375, "learning_rate": 9.011821709602227e-07, "loss": 0.1146, "num_tokens": 818912208.0, "reward": 1.5513393878936768, "reward_std": 0.3249755799770355, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.19118894636631012, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1089.977783203125, "completions/mean_terminated_length": 835.5875854492188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2940706408822119, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11654853424961242, "kl": 0.0165252685546875, "learning_rate": 9.009727531489949e-07, "loss": 0.0914, "num_tokens": 819476022.0, "reward": 1.4866071939468384, "reward_std": 0.39225301146507263, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20332752168178558, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 881.82373046875, "completions/mean_terminated_length": 728.6893920898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29428373554951787, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14870577540017033, "kl": 0.0216064453125, "learning_rate": 9.007631411012129e-07, "loss": 0.0785, "num_tokens": 819935543.0, "reward": 1.5290179252624512, "reward_std": 0.3149246871471405, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.1934608370065689, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1035.859375, "completions/mean_terminated_length": 815.8287963867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2944968302168238, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1279863416971706, "kl": 0.0184326171875, "learning_rate": 9.00553334932881e-07, "loss": 0.1036, "num_tokens": 820472856.0, "reward": 1.4447544813156128, "reward_std": 0.34381943941116333, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21214282512664795, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1069.825927734375, "completions/mean_terminated_length": 863.6162109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2947099248841298, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12909815590518994, "kl": 0.0171661376953125, "learning_rate": 9.003433347601108e-07, "loss": 0.1012, "num_tokens": 821022554.0, "reward": 1.3755581378936768, "reward_std": 0.3863195776939392, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.22320599853992462, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1051.138427734375, "completions/mean_terminated_length": 775.6524047851562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.29492301955143574, "frac_reward_zero_std": 0.0, "grad_norm": 0.33523925290709916, "kl": 0.01800537109375, "learning_rate": 9.001331406991212e-07, "loss": 0.092, "num_tokens": 821567704.0, "reward": 1.3169643878936768, "reward_std": 0.32823944091796875, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.22093555331230164, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 992.8214721679688, "completions/mean_terminated_length": 770.37841796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2951361142187417, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14786096024583684, "kl": 0.021942138671875, "learning_rate": 8.999227528662388e-07, "loss": 0.0938, "num_tokens": 822087192.0, "reward": 1.5351563692092896, "reward_std": 0.31625211238861084, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17932796478271484, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1051.263427734375, "completions/mean_terminated_length": 857.2319946289062, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.29534920888604765, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11995310838117135, "kl": 0.01861572265625, "learning_rate": 8.997121713778968e-07, "loss": 0.0878, "num_tokens": 822625646.0, "reward": 1.4575893878936768, "reward_std": 0.29564735293388367, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18991795182228088, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1031.4129638671875, "completions/mean_terminated_length": 807.0435791015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29556230355335356, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12389574815147643, "kl": 0.01812744140625, "learning_rate": 8.995013963506362e-07, "loss": 0.1128, "num_tokens": 823161543.0, "reward": 1.536272406578064, "reward_std": 0.3453434109687805, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.19750888645648956, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1004.1406860351562, "completions/mean_terminated_length": 814.09765625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2957753982206595, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 1.1377261640978062, "kl": 0.08575439453125, "learning_rate": 8.992904279011048e-07, "loss": 0.1233, "num_tokens": 823686262.0, "reward": 1.5055804252624512, "reward_std": 0.3839109539985657, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.23246414959430695, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 982.5379638671875, "completions/mean_terminated_length": 848.6859130859375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.29598849288796547, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1296868288252111, "kl": 0.01849365234375, "learning_rate": 8.990792661460575e-07, "loss": 0.1026, "num_tokens": 824187191.0, "reward": 1.6205357313156128, "reward_std": 0.3464025855064392, "rewards/accuracy_reward/mean": 0.6785714030265808, "rewards/accuracy_reward/std": 0.4675469696521759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.16965599358081818, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1049.513427734375, "completions/mean_terminated_length": 822.4602661132812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.29620158755527143, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12193831873142236, "kl": 0.016998291015625, "learning_rate": 8.98867911202356e-07, "loss": 0.074, "num_tokens": 824729789.0, "reward": 1.3967634439468384, "reward_std": 0.34617260098457336, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.2049778252840042, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 993.6027221679688, "completions/mean_terminated_length": 833.6812133789062, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2964146822225774, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13133356104270477, "kl": 0.01776123046875, "learning_rate": 8.986563631869693e-07, "loss": 0.0919, "num_tokens": 825244571.0, "reward": 1.4469866752624512, "reward_std": 0.36387813091278076, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19768576323986053, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1032.3192138671875, "completions/mean_terminated_length": 801.356201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29662777688988334, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13786845038553006, "kl": 0.0186767578125, "learning_rate": 8.984446222169729e-07, "loss": 0.1207, "num_tokens": 825776954.0, "reward": 1.3431919813156128, "reward_std": 0.3439805209636688, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19445767998695374, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1010.3303833007812, "completions/mean_terminated_length": 788.1734619140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2968408715571893, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11964439136483068, "kl": 0.017242431640625, "learning_rate": 8.982326884095492e-07, "loss": 0.0723, "num_tokens": 826298686.0, "reward": 1.4732143878936768, "reward_std": 0.29365402460098267, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19774970412254333, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1014.1272583007812, "completions/mean_terminated_length": 806.2440185546875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.29705396622449526, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13063972878830846, "kl": 0.0179443359375, "learning_rate": 8.980205618819877e-07, "loss": 0.0632, "num_tokens": 826820199.0, "reward": 1.4916294813156128, "reward_std": 0.34952548146247864, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.15956845879554749, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1100.560302734375, "completions/mean_terminated_length": 821.2572021484375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.29726706089180116, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1983475992856638, "kl": 0.019012451171875, "learning_rate": 8.978082427516837e-07, "loss": 0.1442, "num_tokens": 827387810.0, "reward": 1.4536831378936768, "reward_std": 0.381161093711853, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.22741149365901947, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1134.140625, "completions/mean_terminated_length": 884.90625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2974801555591071, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11928166269437718, "kl": 0.0169830322265625, "learning_rate": 8.975957311361398e-07, "loss": 0.0339, "num_tokens": 827967185.0, "reward": 1.3571429252624512, "reward_std": 0.36671510338783264, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1724584847688675, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 988.3772583007812, "completions/mean_terminated_length": 758.0244750976562, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2976932502264131, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12996950343475697, "kl": 0.0191650390625, "learning_rate": 8.973830271529649e-07, "loss": 0.1172, "num_tokens": 828478618.0, "reward": 1.4899554252624512, "reward_std": 0.28695932030677795, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.16277234256267548, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 990.5692138671875, "completions/mean_terminated_length": 807.8717651367188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29790634489371903, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11925011063101904, "kl": 0.0163421630859375, "learning_rate": 8.971701309198742e-07, "loss": 0.0554, "num_tokens": 828993209.0, "reward": 1.5580357313156128, "reward_std": 0.3501366376876831, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15220165252685547, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1007.450927734375, "completions/mean_terminated_length": 827.670166015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.298119439561025, "frac_reward_zero_std": 0.0, "grad_norm": 0.13017080383554278, "kl": 0.019744873046875, "learning_rate": 8.9695704255469e-07, "loss": 0.1083, "num_tokens": 829518163.0, "reward": 1.5178571939468384, "reward_std": 0.37711983919143677, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1928403526544571, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1030.9442138671875, "completions/mean_terminated_length": 771.6947021484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29833253422833095, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16500231690753925, "kl": 0.0185546875, "learning_rate": 8.967437621753398e-07, "loss": 0.1594, "num_tokens": 830046042.0, "reward": 1.4369419813156128, "reward_std": 0.35246893763542175, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19662198424339294, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 875.685302734375, "completions/mean_terminated_length": 748.0073852539062, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2985456288956369, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13329389179295378, "kl": 0.020660400390625, "learning_rate": 8.965302898998581e-07, "loss": 0.0713, "num_tokens": 830506173.0, "reward": 1.5429688692092896, "reward_std": 0.3309931755065918, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.17006663978099823, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 957.9777221679688, "completions/mean_terminated_length": 789.41748046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.29875872356294286, "frac_reward_zero_std": 0.0, "grad_norm": 0.14822042720898254, "kl": 0.019866943359375, "learning_rate": 8.963166258463859e-07, "loss": 0.0955, "num_tokens": 831006099.0, "reward": 1.5239956378936768, "reward_std": 0.3759874403476715, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17854657769203186, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 969.15185546875, "completions/mean_terminated_length": 772.7388305664062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.29897181823024876, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.17107355009678538, "kl": 0.02093505859375, "learning_rate": 8.961027701331693e-07, "loss": 0.1572, "num_tokens": 831504551.0, "reward": 1.3850446939468384, "reward_std": 0.2944815158843994, "rewards/accuracy_reward/mean": 0.46990740299224854, "rewards/accuracy_reward/std": 0.4996722638607025, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.19884200394153595, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 999.8348388671875, "completions/mean_terminated_length": 802.4349975585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2991849128975547, "frac_reward_zero_std": 0.0, "grad_norm": 0.38181681629980196, "kl": 0.022247314453125, "learning_rate": 8.958887228785615e-07, "loss": 0.0852, "num_tokens": 832024333.0, "reward": 1.3325893878936768, "reward_std": 0.387983500957489, "rewards/accuracy_reward/mean": 0.42824074625968933, "rewards/accuracy_reward/std": 0.4953974783420563, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.2067493349313736, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1125.2366943359375, "completions/mean_terminated_length": 856.6512451171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2993980075648607, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11975598424664902, "kl": 0.016448974609375, "learning_rate": 8.95674484201021e-07, "loss": 0.1086, "num_tokens": 832595943.0, "reward": 1.3883929252624512, "reward_std": 0.3373308479785919, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.2012539505958557, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 945.3638916015625, "completions/mean_terminated_length": 778.1259765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.29961110223216664, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13186394636310558, "kl": 0.0201416015625, "learning_rate": 8.954600542191128e-07, "loss": 0.0659, "num_tokens": 833087578.0, "reward": 1.5033482313156128, "reward_std": 0.36830461025238037, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1880783587694168, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 976.5469360351562, "completions/mean_terminated_length": 736.4945068359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2998241968994726, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12730494152164304, "kl": 0.018829345703125, "learning_rate": 8.952454330515072e-07, "loss": 0.0926, "num_tokens": 833593775.0, "reward": 1.4575893878936768, "reward_std": 0.33134713768959045, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1884397715330124, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1073.37060546875, "completions/mean_terminated_length": 874.252685546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30003729156677855, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12345956694277439, "kl": 0.01751708984375, "learning_rate": 8.950306208169805e-07, "loss": 0.0634, "num_tokens": 834140805.0, "reward": 1.3616071939468384, "reward_std": 0.35382047295570374, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.18987849354743958, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1102.6741943359375, "completions/mean_terminated_length": 881.3168334960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3002503862340845, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11783133569107644, "kl": 0.0163421630859375, "learning_rate": 8.948156176344154e-07, "loss": 0.0644, "num_tokens": 834703107.0, "reward": 1.3426339626312256, "reward_std": 0.37391969561576843, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20146164298057556, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1113.5513916015625, "completions/mean_terminated_length": 809.4408569335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.30046348090139047, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11731171392788947, "kl": 0.0167999267578125, "learning_rate": 8.94600423622799e-07, "loss": 0.0591, "num_tokens": 835272682.0, "reward": 1.3945313692092896, "reward_std": 0.39762619137763977, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.221673846244812, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1082.513427734375, "completions/mean_terminated_length": 878.9783935546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30067657556869637, "frac_reward_zero_std": 0.0, "grad_norm": 0.10843301400402108, "kl": 0.017425537109375, "learning_rate": 8.943850389012252e-07, "loss": 0.0474, "num_tokens": 835822624.0, "reward": 1.489397406578064, "reward_std": 0.36612966656684875, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.18964743614196777, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1047.0513916015625, "completions/mean_terminated_length": 855.3803100585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3008896702360023, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12435930285702078, "kl": 0.018768310546875, "learning_rate": 8.941694635888928e-07, "loss": 0.1483, "num_tokens": 836357031.0, "reward": 1.399553656578064, "reward_std": 0.3439539968967438, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.21858371794223785, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1026.9554443359375, "completions/mean_terminated_length": 824.9304809570312, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3011027649033083, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.127661821463488, "kl": 0.01885986328125, "learning_rate": 8.939536978051062e-07, "loss": 0.0736, "num_tokens": 836896739.0, "reward": 1.5228794813156128, "reward_std": 0.3001701533794403, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17181210219860077, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1002.4710083007812, "completions/mean_terminated_length": 825.0313720703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.30131585957061424, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13035861436231824, "kl": 0.0194091796875, "learning_rate": 8.937377416692752e-07, "loss": 0.0618, "num_tokens": 837417414.0, "reward": 1.5223214626312256, "reward_std": 0.27997490763664246, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17525888979434967, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1007.7388916015625, "completions/mean_terminated_length": 824.8057861328125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.3015289542379202, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13182614690024136, "kl": 0.02020263671875, "learning_rate": 8.935215953009151e-07, "loss": 0.085, "num_tokens": 837931617.0, "reward": 1.4313616752624512, "reward_std": 0.2713320255279541, "rewards/accuracy_reward/mean": 0.5231481194496155, "rewards/accuracy_reward/std": 0.5000429749488831, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.19669181108474731, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1089.71435546875, "completions/mean_terminated_length": 817.8796997070312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30174204890522616, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14171619584956388, "kl": 0.019561767578125, "learning_rate": 8.933052588196464e-07, "loss": 0.0978, "num_tokens": 838497617.0, "reward": 1.2600446939468384, "reward_std": 0.3605620563030243, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.21962924301624298, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 986.1897583007812, "completions/mean_terminated_length": 786.2201538085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3019551435725321, "frac_reward_zero_std": 0.0, "grad_norm": 0.11813334522327122, "kl": 0.020294189453125, "learning_rate": 8.930887323451947e-07, "loss": 0.0675, "num_tokens": 839009142.0, "reward": 1.4302456378936768, "reward_std": 0.40599361062049866, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.22104217112064362, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1133.3326416015625, "completions/mean_terminated_length": 883.8778686523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30216823823983807, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13537016539897972, "kl": 0.017669677734375, "learning_rate": 8.928720159973908e-07, "loss": 0.1098, "num_tokens": 839592987.0, "reward": 1.3348214626312256, "reward_std": 0.38245120644569397, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9017857313156128, "rewards/tag_count_reward/std": 0.23909583687782288, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1329.9866943359375, "completions/mean_terminated_length": 996.7908325195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.302381332907144, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11230467899749877, "kl": 0.013580322265625, "learning_rate": 8.926551098961708e-07, "loss": 0.0544, "num_tokens": 840263525.0, "reward": 1.2991071939468384, "reward_std": 0.38015004992485046, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21875499188899994, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1071.84375, "completions/mean_terminated_length": 802.0797729492188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.30259442757444993, "frac_reward_zero_std": 0.0, "grad_norm": 0.14000220224572366, "kl": 0.02001953125, "learning_rate": 8.924380141615753e-07, "loss": 0.0884, "num_tokens": 840819663.0, "reward": 1.2310268878936768, "reward_std": 0.3986087143421173, "rewards/accuracy_reward/mean": 0.35879629850387573, "rewards/accuracy_reward/std": 0.48020341992378235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8850446343421936, "rewards/tag_count_reward/std": 0.24839438498020172, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1129.169677734375, "completions/mean_terminated_length": 871.8971557617188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3028075222417559, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12882822493613516, "kl": 0.0179443359375, "learning_rate": 8.922207289137504e-07, "loss": 0.0871, "num_tokens": 841398331.0, "reward": 1.3470982313156128, "reward_std": 0.3199007511138916, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.2269863337278366, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1031.1473388671875, "completions/mean_terminated_length": 833.2000122070312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.30302061690906185, "frac_reward_zero_std": 0.0, "grad_norm": 0.13745037336854254, "kl": 0.01898193359375, "learning_rate": 8.920032542729468e-07, "loss": 0.1115, "num_tokens": 841929501.0, "reward": 1.5027902126312256, "reward_std": 0.4292661249637604, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.23853585124015808, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 981.5625610351562, "completions/mean_terminated_length": 826.09716796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3032337115763678, "frac_reward_zero_std": 0.0, "grad_norm": 0.15454235007165662, "kl": 0.020843505859375, "learning_rate": 8.917855903595202e-07, "loss": 0.09, "num_tokens": 842446201.0, "reward": 1.3376116752624512, "reward_std": 0.3645228445529938, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.21985293924808502, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1202.171875, "completions/mean_terminated_length": 906.6415405273438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.30344680624367376, "frac_reward_zero_std": 0.0, "grad_norm": 0.39169152412779784, "kl": 0.026580810546875, "learning_rate": 8.915677372939306e-07, "loss": 0.054, "num_tokens": 843066710.0, "reward": 1.3247768878936768, "reward_std": 0.3521655201911926, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9095982313156128, "rewards/tag_count_reward/std": 0.23394164443016052, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 997.5670166015625, "completions/mean_terminated_length": 856.622802734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3036599009109797, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12774237753487433, "kl": 0.020965576171875, "learning_rate": 8.913496951967434e-07, "loss": 0.0988, "num_tokens": 843588500.0, "reward": 1.6155134439468384, "reward_std": 0.3377233147621155, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137969970703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.2115476429462433, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1059.5045166015625, "completions/mean_terminated_length": 918.290771484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3038729955782857, "frac_reward_zero_std": 0.0, "grad_norm": 0.11622874434177637, "kl": 0.019317626953125, "learning_rate": 8.911314641886279e-07, "loss": 0.0491, "num_tokens": 844132358.0, "reward": 1.5552456378936768, "reward_std": 0.34310492873191833, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.1903373748064041, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 950.9888916015625, "completions/mean_terminated_length": 787.8436279296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3040860902455916, "frac_reward_zero_std": 0.0, "grad_norm": 0.14548367393550904, "kl": 0.021087646484375, "learning_rate": 8.909130443903583e-07, "loss": 0.0942, "num_tokens": 844628193.0, "reward": 1.5474331378936768, "reward_std": 0.3538978695869446, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20298878848552704, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 940.79248046875, "completions/mean_terminated_length": 772.8612060546875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.30429918491289754, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14924011144771562, "kl": 0.022491455078125, "learning_rate": 8.906944359228133e-07, "loss": 0.0863, "num_tokens": 845116692.0, "reward": 1.4754464626312256, "reward_std": 0.34190747141838074, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17203812301158905, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1048.3125, "completions/mean_terminated_length": 814.2258911132812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3045122795802035, "frac_reward_zero_std": 0.0, "grad_norm": 0.1324983142377475, "kl": 0.018096923828125, "learning_rate": 8.904756389069762e-07, "loss": 0.1439, "num_tokens": 845663616.0, "reward": 1.3688616752624512, "reward_std": 0.42432478070259094, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8889508843421936, "rewards/tag_count_reward/std": 0.2493293285369873, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1089.1160888671875, "completions/mean_terminated_length": 893.2150268554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30472537424750945, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12359533576319118, "kl": 0.0184326171875, "learning_rate": 8.902566534639339e-07, "loss": 0.0769, "num_tokens": 846225460.0, "reward": 1.31640625, "reward_std": 0.34393852949142456, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2005011886358261, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 998.60498046875, "completions/mean_terminated_length": 784.2123413085938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3049384689148154, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.20252959677262336, "kl": 0.02154541015625, "learning_rate": 8.900374797148784e-07, "loss": 0.0606, "num_tokens": 846743843.0, "reward": 1.4547991752624512, "reward_std": 0.3482241928577423, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.1992206871509552, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 910.3348388671875, "completions/mean_terminated_length": 751.1195678710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30515156358212137, "frac_reward_zero_std": 0.0, "grad_norm": 0.13393633284827047, "kl": 0.02105712890625, "learning_rate": 8.898181177811056e-07, "loss": 0.0823, "num_tokens": 847221081.0, "reward": 1.469866156578064, "reward_std": 0.3524143397808075, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.199243426322937, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 969.0335083007812, "completions/mean_terminated_length": 824.2607421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3053646582494273, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12199333819665728, "kl": 0.01800537109375, "learning_rate": 8.895985677840153e-07, "loss": 0.0557, "num_tokens": 847716792.0, "reward": 1.575334906578064, "reward_std": 0.4001903831958771, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1747443825006485, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1069.8170166015625, "completions/mean_terminated_length": 806.5665893554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3055777529167333, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 1.7986631416389125, "kl": 0.01934814453125, "learning_rate": 8.89378829845112e-07, "loss": 0.0716, "num_tokens": 848264230.0, "reward": 1.4302456378936768, "reward_std": 0.3747895359992981, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.23921002447605133, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 982.71435546875, "completions/mean_terminated_length": 771.9358520507812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3057908475840392, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12503018963776655, "kl": 0.018768310546875, "learning_rate": 8.891589040860035e-07, "loss": 0.039, "num_tokens": 848773622.0, "reward": 1.4626116752624512, "reward_std": 0.3829914629459381, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.2086479365825653, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 938.7678833007812, "completions/mean_terminated_length": 760.6010131835938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.30600394225134514, "frac_reward_zero_std": 0.0, "grad_norm": 0.13032465986521613, "kl": 0.01953125, "learning_rate": 8.88938790628402e-07, "loss": 0.0855, "num_tokens": 849261438.0, "reward": 1.5200893878936768, "reward_std": 0.34632810950279236, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20075708627700806, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 971.5647583007812, "completions/mean_terminated_length": 778.9395141601562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3062170369186511, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12226323904790523, "kl": 0.01904296875, "learning_rate": 8.887184895941234e-07, "loss": 0.0517, "num_tokens": 849766811.0, "reward": 1.5664063692092896, "reward_std": 0.3269766867160797, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.16479510068893433, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1100.609375, "completions/mean_terminated_length": 862.4385375976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30643013158595706, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12605662153042588, "kl": 0.016448974609375, "learning_rate": 8.884980011050876e-07, "loss": 0.1001, "num_tokens": 850329948.0, "reward": 1.4341518878936768, "reward_std": 0.3133377730846405, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9118303656578064, "rewards/tag_count_reward/std": 0.23598192632198334, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 995.6183471679688, "completions/mean_terminated_length": 804.0238037109375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.306643226253263, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13059056908956307, "kl": 0.017791748046875, "learning_rate": 8.882773252833177e-07, "loss": 0.096, "num_tokens": 850846849.0, "reward": 1.4280134439468384, "reward_std": 0.41415274143218994, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9123883843421936, "rewards/tag_count_reward/std": 0.22682885825634003, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 942.2969360351562, "completions/mean_terminated_length": 790.7537841796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.30685632092056897, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12149133384470984, "kl": 0.02166748046875, "learning_rate": 8.880564622509415e-07, "loss": 0.025, "num_tokens": 851333766.0, "reward": 1.5368304252624512, "reward_std": 0.2814602255821228, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.18040810525417328, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1174.8013916015625, "completions/mean_terminated_length": 958.3258666992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3070694155878749, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11595228758811701, "kl": 0.0159149169921875, "learning_rate": 8.878354121301893e-07, "loss": 0.0889, "num_tokens": 851931405.0, "reward": 1.4012277126312256, "reward_std": 0.3647383451461792, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.21759268641471863, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1204.3504638671875, "completions/mean_terminated_length": 888.6287841796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3072825102551809, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12719975783456006, "kl": 0.0158233642578125, "learning_rate": 8.876141750433957e-07, "loss": 0.0943, "num_tokens": 852544426.0, "reward": 1.3632813692092896, "reward_std": 0.3926142454147339, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2444191426038742, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1024.5625, "completions/mean_terminated_length": 841.4210815429688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3074956049224868, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1344979547262212, "kl": 0.020660400390625, "learning_rate": 8.873927511129985e-07, "loss": 0.0852, "num_tokens": 853074342.0, "reward": 1.493303656578064, "reward_std": 0.4106661379337311, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8839285969734192, "rewards/tag_count_reward/std": 0.2506782114505768, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1126.8013916015625, "completions/mean_terminated_length": 865.4871215820312, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.30770869958979274, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11972460647217888, "kl": 0.016265869140625, "learning_rate": 8.871711404615385e-07, "loss": 0.1005, "num_tokens": 853650717.0, "reward": 1.3493304252624512, "reward_std": 0.35324156284332275, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.2245088666677475, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 966.37060546875, "completions/mean_terminated_length": 839.5960083007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3079217942570987, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13618951326020987, "kl": 0.01934814453125, "learning_rate": 8.869493432116606e-07, "loss": 0.0628, "num_tokens": 854149171.0, "reward": 1.5758929252624512, "reward_std": 0.3624713718891144, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.2171051949262619, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1086.8035888671875, "completions/mean_terminated_length": 861.7300415039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30813488892440466, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12460285505484467, "kl": 0.0168914794921875, "learning_rate": 8.867273594861126e-07, "loss": 0.1097, "num_tokens": 854706651.0, "reward": 1.3828126192092896, "reward_std": 0.29108303785324097, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9185267686843872, "rewards/tag_count_reward/std": 0.2207179069519043, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 978.16748046875, "completions/mean_terminated_length": 766.4893188476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3083479835917106, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12489247846440525, "kl": 0.02056884765625, "learning_rate": 8.865051894077452e-07, "loss": 0.0686, "num_tokens": 855213094.0, "reward": 1.4341518878936768, "reward_std": 0.3647061288356781, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9095982313156128, "rewards/tag_count_reward/std": 0.22541894018650055, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 937.71435546875, "completions/mean_terminated_length": 752.6666870117188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3085610782590166, "frac_reward_zero_std": 0.0, "grad_norm": 0.13967658804301156, "kl": 0.02001953125, "learning_rate": 8.862828330995129e-07, "loss": 0.1596, "num_tokens": 855696086.0, "reward": 1.5178571939468384, "reward_std": 0.35910525918006897, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.2277139127254486, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1096.8504638671875, "completions/mean_terminated_length": 861.0501098632812, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.30877417292632253, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11889917322130462, "kl": 0.0165252685546875, "learning_rate": 8.860602906844726e-07, "loss": 0.1172, "num_tokens": 856258499.0, "reward": 1.442522406578064, "reward_std": 0.3507930338382721, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19407851994037628, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1149.5826416015625, "completions/mean_terminated_length": 929.969482421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.3089872675936285, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12079038909325925, "kl": 0.015411376953125, "learning_rate": 8.858375622857847e-07, "loss": 0.0608, "num_tokens": 856851944.0, "reward": 1.3342634439468384, "reward_std": 0.35259315371513367, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.22604598104953766, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 979.0402221679688, "completions/mean_terminated_length": 794.3507690429688, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.30920036226093445, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15140166753525355, "kl": 0.020233154296875, "learning_rate": 8.856146480267124e-07, "loss": 0.0894, "num_tokens": 857357562.0, "reward": 1.5429688692092896, "reward_std": 0.32985129952430725, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19111628830432892, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1029.4710693359375, "completions/mean_terminated_length": 844.0396118164062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30941345692824035, "frac_reward_zero_std": 0.0, "grad_norm": 0.1506919165887795, "kl": 0.01776123046875, "learning_rate": 8.853915480306215e-07, "loss": 0.1038, "num_tokens": 857890125.0, "reward": 1.5463169813156128, "reward_std": 0.3308275640010834, "rewards/accuracy_reward/mean": 0.6203703880310059, "rewards/accuracy_reward/std": 0.48585736751556396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.17204447090625763, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1104.41748046875, "completions/mean_terminated_length": 889.849365234375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3096265515955463, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12930045866945938, "kl": 0.01800537109375, "learning_rate": 8.851682624209806e-07, "loss": 0.1086, "num_tokens": 858453880.0, "reward": 1.4352679252624512, "reward_std": 0.37976232171058655, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.1891407072544098, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 996.9620971679688, "completions/mean_terminated_length": 785.6273803710938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.30983964626285226, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1267697512715945, "kl": 0.019775390625, "learning_rate": 8.849447913213615e-07, "loss": 0.0887, "num_tokens": 858968295.0, "reward": 1.5424107313156128, "reward_std": 0.34806954860687256, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.2018609493970871, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1156.212158203125, "completions/mean_terminated_length": 935.1281127929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3100527409301582, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12480315459571291, "kl": 0.016510009765625, "learning_rate": 8.847211348554382e-07, "loss": 0.0794, "num_tokens": 859552198.0, "reward": 1.3694196939468384, "reward_std": 0.3284815549850464, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18730680644512177, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 953.122802734375, "completions/mean_terminated_length": 809.3510131835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3102658355974642, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13132016542260447, "kl": 0.01971435546875, "learning_rate": 8.844972931469875e-07, "loss": 0.0786, "num_tokens": 860047325.0, "reward": 1.5842634439468384, "reward_std": 0.3621877431869507, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.21228989958763123, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 965.8795166015625, "completions/mean_terminated_length": 811.290771484375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.31047893026477014, "frac_reward_zero_std": 0.0, "grad_norm": 0.1259837857832474, "kl": 0.02093505859375, "learning_rate": 8.842732663198886e-07, "loss": 0.096, "num_tokens": 860551431.0, "reward": 1.536272406578064, "reward_std": 0.3589436411857605, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.2096388190984726, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1039.6629638671875, "completions/mean_terminated_length": 833.6586303710938, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3106920249320761, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13331623789791222, "kl": 0.01739501953125, "learning_rate": 8.840490544981234e-07, "loss": 0.0482, "num_tokens": 861081632.0, "reward": 1.4609376192092896, "reward_std": 0.3299265503883362, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17640554904937744, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1123.0804443359375, "completions/mean_terminated_length": 880.7774047851562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.31090511959938205, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12041190745192736, "kl": 0.015289306640625, "learning_rate": 8.838246578057757e-07, "loss": 0.0706, "num_tokens": 861657444.0, "reward": 1.5212054252624512, "reward_std": 0.3582844138145447, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19821317493915558, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1093.15185546875, "completions/mean_terminated_length": 898.0752563476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.31111821426668795, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12049862447409308, "kl": 0.01751708984375, "learning_rate": 8.836000763670319e-07, "loss": 0.1311, "num_tokens": 862220024.0, "reward": 1.4587054252624512, "reward_std": 0.355456680059433, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21038557589054108, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1082.078125, "completions/mean_terminated_length": 835.86279296875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.3113313089339939, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13660460735355878, "kl": 0.01715087890625, "learning_rate": 8.833753103061808e-07, "loss": 0.12, "num_tokens": 862777771.0, "reward": 1.4419643878936768, "reward_std": 0.40680035948753357, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9196428656578064, "rewards/tag_count_reward/std": 0.22674697637557983, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1010.2567138671875, "completions/mean_terminated_length": 840.4441528320312, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.31154440360129987, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.140369731126648, "kl": 0.01898193359375, "learning_rate": 8.831503597476131e-07, "loss": 0.088, "num_tokens": 863300750.0, "reward": 1.4592634439468384, "reward_std": 0.35447394847869873, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19590957462787628, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1028.0670166015625, "completions/mean_terminated_length": 829.52001953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3117574982686058, "frac_reward_zero_std": 0.0, "grad_norm": 0.13436031129568018, "kl": 0.018035888671875, "learning_rate": 8.829252248158219e-07, "loss": 0.1591, "num_tokens": 863824716.0, "reward": 1.4570313692092896, "reward_std": 0.384103924036026, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.20502042770385742, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1135.0848388671875, "completions/mean_terminated_length": 869.365966796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3119705929359118, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12471225530275858, "kl": 0.0172119140625, "learning_rate": 8.82699905635402e-07, "loss": 0.061, "num_tokens": 864397026.0, "reward": 1.3666294813156128, "reward_std": 0.35109055042266846, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23580926656723022, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1017.0848388671875, "completions/mean_terminated_length": 819.6754760742188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.31218368760321774, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13100548626235745, "kl": 0.019927978515625, "learning_rate": 8.824744023310504e-07, "loss": 0.1191, "num_tokens": 864913064.0, "reward": 1.5842634439468384, "reward_std": 0.35711053013801575, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.21352127194404602, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 985.6719360351562, "completions/mean_terminated_length": 798.8582763671875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3123967822705237, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11966240567919921, "kl": 0.0216064453125, "learning_rate": 8.822487150275657e-07, "loss": 0.11, "num_tokens": 865421413.0, "reward": 1.473772406578064, "reward_std": 0.3350149989128113, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.18451586365699768, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1094.984375, "completions/mean_terminated_length": 887.8070678710938, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.31260987693782966, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11003275335496294, "kl": 0.016265869140625, "learning_rate": 8.820228438498486e-07, "loss": 0.0617, "num_tokens": 865980750.0, "reward": 1.4235491752624512, "reward_std": 0.3181097209453583, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18088065087795258, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1196.138427734375, "completions/mean_terminated_length": 905.3832397460938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.31282297160513556, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12129027857759477, "kl": 0.0159454345703125, "learning_rate": 8.817967889229018e-07, "loss": 0.0761, "num_tokens": 866591292.0, "reward": 1.3593751192092896, "reward_std": 0.31770387291908264, "rewards/accuracy_reward/mean": 0.44675925374031067, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.20392835140228271, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1013.7902221679688, "completions/mean_terminated_length": 844.5558471679688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3130360662724415, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.21719677367013085, "kl": 0.02008056640625, "learning_rate": 8.815705503718291e-07, "loss": 0.0643, "num_tokens": 867116766.0, "reward": 1.551897406578064, "reward_std": 0.3549552857875824, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.20187872648239136, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 978.419677734375, "completions/mean_terminated_length": 780.3491821289062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3132491609397475, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1426461907580897, "kl": 0.0190887451171875, "learning_rate": 8.813441283218365e-07, "loss": 0.0735, "num_tokens": 867620394.0, "reward": 1.4285714626312256, "reward_std": 0.23254993557929993, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17920349538326263, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1162.2054443359375, "completions/mean_terminated_length": 887.6608276367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.31346225560705343, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11672412391722606, "kl": 0.017486572265625, "learning_rate": 8.811175228982311e-07, "loss": 0.1135, "num_tokens": 868204614.0, "reward": 1.4492188692092896, "reward_std": 0.3632122576236725, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.25323429703712463, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1092.0625, "completions/mean_terminated_length": 810.2543334960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3136753502743594, "frac_reward_zero_std": 0.0, "grad_norm": 0.37867127389765814, "kl": 0.02203369140625, "learning_rate": 8.808907342264215e-07, "loss": 0.0922, "num_tokens": 868773474.0, "reward": 1.313616156578064, "reward_std": 0.32644402980804443, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1910289078950882, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1127.513427734375, "completions/mean_terminated_length": 911.9724731445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.31388844494166535, "frac_reward_zero_std": 0.0, "grad_norm": 0.13179886525244894, "kl": 0.01605224609375, "learning_rate": 8.806637624319181e-07, "loss": 0.1, "num_tokens": 869350568.0, "reward": 1.4040179252624512, "reward_std": 0.3833931088447571, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18981274962425232, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1070.79248046875, "completions/mean_terminated_length": 883.6675415039062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3141015396089713, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1286639624116333, "kl": 0.019805908203125, "learning_rate": 8.804366076403323e-07, "loss": 0.0651, "num_tokens": 869902411.0, "reward": 1.4068081378936768, "reward_std": 0.31713157892227173, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.20184780657291412, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1086.634033203125, "completions/mean_terminated_length": 858.2431030273438, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.31431463427627726, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11614156694184209, "kl": 0.0185546875, "learning_rate": 8.80209269977377e-07, "loss": 0.0299, "num_tokens": 870458551.0, "reward": 1.5033482313156128, "reward_std": 0.2739241123199463, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.166563019156456, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1015.8928833007812, "completions/mean_terminated_length": 801.6819458007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.31452772894358316, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1423653452407796, "kl": 0.01953125, "learning_rate": 8.799817495688662e-07, "loss": 0.1146, "num_tokens": 870977863.0, "reward": 1.5267857313156128, "reward_std": 0.3386031687259674, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.20867261290550232, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1004.3348388671875, "completions/mean_terminated_length": 767.0082397460938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3147408236108891, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13229236091058097, "kl": 0.019775390625, "learning_rate": 8.797540465407148e-07, "loss": 0.1216, "num_tokens": 871498429.0, "reward": 1.4960938692092896, "reward_std": 0.3151138722896576, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.19176188111305237, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1023.43310546875, "completions/mean_terminated_length": 820.7112426757812, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3149539182781951, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11991107627388635, "kl": 0.019500732421875, "learning_rate": 8.795261610189393e-07, "loss": 0.0731, "num_tokens": 872029359.0, "reward": 1.4252232313156128, "reward_std": 0.24590043723583221, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19946885108947754, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1170.4285888671875, "completions/mean_terminated_length": 924.7085571289062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.31516701294550103, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1170126633602437, "kl": 0.0150146484375, "learning_rate": 8.792980931296567e-07, "loss": 0.0762, "num_tokens": 872627919.0, "reward": 1.3493304252624512, "reward_std": 0.3699823319911957, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.2138228863477707, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1090.509033203125, "completions/mean_terminated_length": 879.1825561523438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.315380107612807, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10467877151054188, "kl": 0.01788330078125, "learning_rate": 8.790698429990853e-07, "loss": 0.0278, "num_tokens": 873185363.0, "reward": 1.3392857313156128, "reward_std": 0.2985284924507141, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15527118742465973, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 935.6160888671875, "completions/mean_terminated_length": 789.5454711914062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.31559320228011295, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12118425502343895, "kl": 0.020751953125, "learning_rate": 8.788414107535437e-07, "loss": 0.0705, "num_tokens": 873665911.0, "reward": 1.6367188692092896, "reward_std": 0.27083009481430054, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.46403056383132935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16406624019145966, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1045.68310546875, "completions/mean_terminated_length": 821.1201782226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3158062969474189, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1233821724481232, "kl": 0.020263671875, "learning_rate": 8.786127965194519e-07, "loss": 0.0668, "num_tokens": 874205593.0, "reward": 1.4414063692092896, "reward_std": 0.21451610326766968, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.14879968762397766, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1003.4308471679688, "completions/mean_terminated_length": 826.154052734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.31601939161472486, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.2609139189347654, "kl": 0.0263671875, "learning_rate": 8.783840004233306e-07, "loss": 0.0835, "num_tokens": 874716858.0, "reward": 1.5837054252624512, "reward_std": 0.33571502566337585, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.18280036747455597, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1085.9888916015625, "completions/mean_terminated_length": 813.0974731445312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.31623248628203077, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12123836479665284, "kl": 0.0180511474609375, "learning_rate": 8.781550225918008e-07, "loss": 0.0891, "num_tokens": 875277397.0, "reward": 1.422991156578064, "reward_std": 0.32917022705078125, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20016857981681824, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1035.3035888671875, "completions/mean_terminated_length": 825.1212768554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3164455809493367, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11800745621817481, "kl": 0.01934814453125, "learning_rate": 8.779258631515837e-07, "loss": 0.0241, "num_tokens": 875809997.0, "reward": 1.4642857313156128, "reward_std": 0.2984984517097473, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18769630789756775, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1009.857177734375, "completions/mean_terminated_length": 833.6710205078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3166586756166427, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13044376500978513, "kl": 0.019195556640625, "learning_rate": 8.776965222295023e-07, "loss": 0.0619, "num_tokens": 876328317.0, "reward": 1.5301339626312256, "reward_std": 0.3145487904548645, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18126414716243744, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 967.9620971679688, "completions/mean_terminated_length": 729.5885620117188, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.31687177028394864, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14855206873655472, "kl": 0.018157958984375, "learning_rate": 8.774669999524787e-07, "loss": 0.1464, "num_tokens": 876828540.0, "reward": 1.5535714626312256, "reward_std": 0.3178226947784424, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1243.310302734375, "completions/mean_terminated_length": 935.3425903320312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3170848649512546, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11817442877012374, "kl": 0.0160980224609375, "learning_rate": 8.772372964475362e-07, "loss": 0.0782, "num_tokens": 877456615.0, "reward": 1.2723214626312256, "reward_std": 0.37030497193336487, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.2609747648239136, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1049.3773193359375, "completions/mean_terminated_length": 845.3575439453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.31729795961856055, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12894982420864745, "kl": 0.0176544189453125, "learning_rate": 8.770074118417981e-07, "loss": 0.0673, "num_tokens": 877995568.0, "reward": 1.5223214626312256, "reward_std": 0.3191626965999603, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.21056649088859558, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1030.9554443359375, "completions/mean_terminated_length": 746.182861328125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3175110542858665, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12753345774012528, "kl": 0.017669677734375, "learning_rate": 8.767773462624876e-07, "loss": 0.1138, "num_tokens": 878527660.0, "reward": 1.3476563692092896, "reward_std": 0.34932416677474976, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.22250016033649445, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 995.2410888671875, "completions/mean_terminated_length": 813.3507690429688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.31772414895317247, "frac_reward_zero_std": 0.0, "grad_norm": 0.12915485732880436, "kl": 0.01953125, "learning_rate": 8.765470998369286e-07, "loss": 0.1138, "num_tokens": 879035272.0, "reward": 1.528459906578064, "reward_std": 0.3196329176425934, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17854659259319305, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 993.294677734375, "completions/mean_terminated_length": 781.2225341796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.31793724362047837, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12991545887580488, "kl": 0.01861572265625, "learning_rate": 8.76316672692545e-07, "loss": 0.078, "num_tokens": 879549996.0, "reward": 1.547991156578064, "reward_std": 0.3738611936569214, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.20101501047611237, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1068.118408203125, "completions/mean_terminated_length": 807.9237060546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.31815033828778433, "frac_reward_zero_std": 0.0, "grad_norm": 0.12911229925940468, "kl": 0.018768310546875, "learning_rate": 8.760860649568605e-07, "loss": 0.1133, "num_tokens": 880094529.0, "reward": 1.419084906578064, "reward_std": 0.3558329939842224, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8967633843421936, "rewards/tag_count_reward/std": 0.25267162919044495, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1023.5870971679688, "completions/mean_terminated_length": 830.6604614257812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3183634329550903, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1315496382652415, "kl": 0.017730712890625, "learning_rate": 8.758552767574988e-07, "loss": 0.0821, "num_tokens": 880618920.0, "reward": 1.4910714626312256, "reward_std": 0.3212418556213379, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20194751024246216, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 980.8370971679688, "completions/mean_terminated_length": 779.859375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.31857652762239624, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1288031127749245, "kl": 0.0185546875, "learning_rate": 8.756243082221834e-07, "loss": 0.0803, "num_tokens": 881128575.0, "reward": 1.4017857313156128, "reward_std": 0.2610216736793518, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18918029963970184, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1080.9910888671875, "completions/mean_terminated_length": 870.771728515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3187896222897022, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13064185902042835, "kl": 0.0174560546875, "learning_rate": 8.75393159478738e-07, "loss": 0.1431, "num_tokens": 881678395.0, "reward": 1.4598214626312256, "reward_std": 0.42064374685287476, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.23766089975833893, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1060.46435546875, "completions/mean_terminated_length": 794.6968994140625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.31900271695700816, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11573500887048836, "kl": 0.016632080078125, "learning_rate": 8.751618306550855e-07, "loss": 0.0628, "num_tokens": 882221195.0, "reward": 1.4754464626312256, "reward_std": 0.35096055269241333, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.21043601632118225, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1027.77685546875, "completions/mean_terminated_length": 799.2021484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3192158116243141, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15932131728619467, "kl": 0.0181884765625, "learning_rate": 8.749303218792486e-07, "loss": 0.0722, "num_tokens": 882751751.0, "reward": 1.4207589626312256, "reward_std": 0.327421098947525, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19321222603321075, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1055.53125, "completions/mean_terminated_length": 836.4849853515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3194289062916201, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12410364282696955, "kl": 0.01806640625, "learning_rate": 8.7469863327935e-07, "loss": 0.1086, "num_tokens": 883306277.0, "reward": 1.4196429252624512, "reward_std": 0.3600113093852997, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.21043600142002106, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1065.3482666015625, "completions/mean_terminated_length": 854.97021484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.319642000958926, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13419866764655478, "kl": 0.017852783203125, "learning_rate": 8.744667649836114e-07, "loss": 0.0812, "num_tokens": 883850225.0, "reward": 1.4587054252624512, "reward_std": 0.3719674050807953, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.18882031738758087, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1178.9710693359375, "completions/mean_terminated_length": 899.5486450195312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.31985509562623193, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11262308267304726, "kl": 0.0153350830078125, "learning_rate": 8.742347171203541e-07, "loss": 0.0092, "num_tokens": 884443972.0, "reward": 1.286272406578064, "reward_std": 0.372450053691864, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.21360310912132263, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1080.5670166015625, "completions/mean_terminated_length": 784.4140014648438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3200681902935379, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12876074398984771, "kl": 0.0179443359375, "learning_rate": 8.74002489817999e-07, "loss": 0.0611, "num_tokens": 885004754.0, "reward": 1.3934152126312256, "reward_std": 0.37429845333099365, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.2174951434135437, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1199.529052734375, "completions/mean_terminated_length": 867.5186767578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.32028128496084385, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11607804143545178, "kl": 0.0154571533203125, "learning_rate": 8.73770083205066e-07, "loss": 0.1213, "num_tokens": 885606207.0, "reward": 1.3716518878936768, "reward_std": 0.3596622049808502, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.22042357921600342, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1093.1138916015625, "completions/mean_terminated_length": 872.7554931640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3204943796281498, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12971223964835987, "kl": 0.0174560546875, "learning_rate": 8.735374974101746e-07, "loss": 0.1606, "num_tokens": 886161170.0, "reward": 1.4614956378936768, "reward_std": 0.4222429394721985, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9034598469734192, "rewards/tag_count_reward/std": 0.2353641390800476, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1123.7679443359375, "completions/mean_terminated_length": 844.3488159179688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.32070747429545576, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13214166459449658, "kl": 0.0168914794921875, "learning_rate": 8.73304732562043e-07, "loss": 0.0892, "num_tokens": 886732666.0, "reward": 1.313616156578064, "reward_std": 0.3497132658958435, "rewards/accuracy_reward/mean": 0.41435185074806213, "rewards/accuracy_reward/std": 0.49318093061447144, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2302192598581314, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 945.1183471679688, "completions/mean_terminated_length": 800.2954711914062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3209205689627617, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13326196932982115, "kl": 0.020751953125, "learning_rate": 8.730717887894887e-07, "loss": 0.0694, "num_tokens": 887218095.0, "reward": 1.5954241752624512, "reward_std": 0.26193109154701233, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17706511914730072, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 941.0178833007812, "completions/mean_terminated_length": 776.3897705078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3211336636300677, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1482744019615174, "kl": 0.017669677734375, "learning_rate": 8.728386662214284e-07, "loss": 0.1017, "num_tokens": 887704055.0, "reward": 1.4693081378936768, "reward_std": 0.35213610529899597, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.18900123238563538, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 947.3303833007812, "completions/mean_terminated_length": 790.091796875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.32134675829737364, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12604777432422962, "kl": 0.020721435546875, "learning_rate": 8.726053649868776e-07, "loss": 0.0869, "num_tokens": 888203803.0, "reward": 1.5452009439468384, "reward_std": 0.36416730284690857, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.19597965478897095, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 984.97998046875, "completions/mean_terminated_length": 801.3167724609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.32155985296467954, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11593775488836382, "kl": 0.0166778564453125, "learning_rate": 8.723718852149506e-07, "loss": 0.0813, "num_tokens": 888721138.0, "reward": 1.3727679252624512, "reward_std": 0.32188543677330017, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15467506647109985, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1081.180908203125, "completions/mean_terminated_length": 858.0687255859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3217729476319855, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1304980965494064, "kl": 0.0179443359375, "learning_rate": 8.721382270348604e-07, "loss": 0.0917, "num_tokens": 889271523.0, "reward": 1.4704241752624512, "reward_std": 0.3440109193325043, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.19078285992145538, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1077.49560546875, "completions/mean_terminated_length": 866.5162963867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32198604229929145, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12638790336304703, "kl": 0.016021728515625, "learning_rate": 8.719043905759193e-07, "loss": 0.1131, "num_tokens": 889826753.0, "reward": 1.4988839626312256, "reward_std": 0.36405402421951294, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.21358920633792877, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1068.8148193359375, "completions/mean_terminated_length": 805.2946166992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3221991369665974, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12852018396247916, "kl": 0.01715087890625, "learning_rate": 8.716703759675376e-07, "loss": 0.1277, "num_tokens": 890378574.0, "reward": 1.3158482313156128, "reward_std": 0.3472348749637604, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.2100292444229126, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1003.5022583007812, "completions/mean_terminated_length": 783.3108520507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32241223163390337, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12722051238431567, "kl": 0.0179443359375, "learning_rate": 8.714361833392246e-07, "loss": 0.0931, "num_tokens": 890896031.0, "reward": 1.3727679252624512, "reward_std": 0.3549431264400482, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 898.294677734375, "completions/mean_terminated_length": 727.3128662109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3226253263012093, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12412175829217294, "kl": 0.020660400390625, "learning_rate": 8.712018128205882e-07, "loss": 0.0578, "num_tokens": 891364387.0, "reward": 1.6568081378936768, "reward_std": 0.26207587122917175, "rewards/accuracy_reward/mean": 0.6941964030265808, "rewards/accuracy_reward/std": 0.4612620174884796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14432783424854279, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1040.7567138671875, "completions/mean_terminated_length": 808.3159790039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3228384209685153, "frac_reward_zero_std": 0.0, "grad_norm": 0.12382511975757346, "kl": 0.018341064453125, "learning_rate": 8.709672645413339e-07, "loss": 0.0903, "num_tokens": 891898438.0, "reward": 1.5161831378936768, "reward_std": 0.3817310631275177, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.18738260865211487, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1024.6273193359375, "completions/mean_terminated_length": 841.4973754882812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.32305151563582124, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12644921259199857, "kl": 0.018890380859375, "learning_rate": 8.707325386312669e-07, "loss": 0.06, "num_tokens": 892427679.0, "reward": 1.3392857313156128, "reward_std": 0.3286016881465912, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1906658411026001, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 987.0223388671875, "completions/mean_terminated_length": 745.7589111328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.32326461030312714, "frac_reward_zero_std": 0.0, "grad_norm": 0.13740912482892004, "kl": 0.02032470703125, "learning_rate": 8.704976352202896e-07, "loss": 0.0765, "num_tokens": 892944841.0, "reward": 1.4001116752624512, "reward_std": 0.3409937918186188, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.20590755343437195, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1041.2076416015625, "completions/mean_terminated_length": 812.2657470703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3234777049704331, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1244034618128558, "kl": 0.01959228515625, "learning_rate": 8.702625544384034e-07, "loss": 0.0806, "num_tokens": 893479014.0, "reward": 1.4179688692092896, "reward_std": 0.31756457686424255, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20640410482883453, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1098.6429443359375, "completions/mean_terminated_length": 863.286865234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32369079963773906, "frac_reward_zero_std": 0.0, "grad_norm": 0.12279058836237525, "kl": 0.01806640625, "learning_rate": 8.700272964157072e-07, "loss": 0.133, "num_tokens": 894046870.0, "reward": 1.4168527126312256, "reward_std": 0.388259619474411, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8967633843421936, "rewards/tag_count_reward/std": 0.24988947808742523, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1085.6429443359375, "completions/mean_terminated_length": 853.7174072265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.323903894305045, "frac_reward_zero_std": 0.0, "grad_norm": 0.12250596582811452, "kl": 0.017059326171875, "learning_rate": 8.697918612823985e-07, "loss": 0.1056, "num_tokens": 894608422.0, "reward": 1.4754464626312256, "reward_std": 0.37866777181625366, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855977296829224, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.2012539505958557, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1076.435302734375, "completions/mean_terminated_length": 871.6189575195312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.324116988972351, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1243985825296824, "kl": 0.01959228515625, "learning_rate": 8.695562491687726e-07, "loss": 0.0999, "num_tokens": 895155721.0, "reward": 1.5412946939468384, "reward_std": 0.3647215962409973, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.2131679654121399, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1125.622802734375, "completions/mean_terminated_length": 843.262451171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.32433008363965693, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13405982126286933, "kl": 0.01788330078125, "learning_rate": 8.693204602052225e-07, "loss": 0.1024, "num_tokens": 895727792.0, "reward": 1.3632813692092896, "reward_std": 0.30096372961997986, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19155997037887573, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1018.0491333007812, "completions/mean_terminated_length": 843.2532958984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3245431783069629, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11830001310244277, "kl": 0.018768310546875, "learning_rate": 8.690844945222397e-07, "loss": 0.0569, "num_tokens": 896245926.0, "reward": 1.5424107313156128, "reward_std": 0.33029937744140625, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.17515915632247925, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 978.24560546875, "completions/mean_terminated_length": 831.62939453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32475627297426884, "frac_reward_zero_std": 0.0, "grad_norm": 0.13150153059147593, "kl": 0.020477294921875, "learning_rate": 8.688483522504131e-07, "loss": 0.0823, "num_tokens": 896752228.0, "reward": 1.567522406578064, "reward_std": 0.37251636385917664, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19617067277431488, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1131.9866943359375, "completions/mean_terminated_length": 855.0523071289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.32496936764157475, "frac_reward_zero_std": 0.25, "grad_norm": 0.6671070064468791, "kl": 0.0179443359375, "learning_rate": 8.686120335204291e-07, "loss": 0.0679, "num_tokens": 897336030.0, "reward": 1.387834906578064, "reward_std": 0.2983705699443817, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1540675312280655, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1053.087158203125, "completions/mean_terminated_length": 833.5013427734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3251824623088807, "frac_reward_zero_std": 0.0, "grad_norm": 0.1284438081510803, "kl": 0.019622802734375, "learning_rate": 8.683755384630724e-07, "loss": 0.0953, "num_tokens": 897883013.0, "reward": 1.430803656578064, "reward_std": 0.36074790358543396, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20961573719978333, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1106.8013916015625, "completions/mean_terminated_length": 850.11083984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32539555697618666, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1358512082235821, "kl": 0.01702880859375, "learning_rate": 8.681388672092247e-07, "loss": 0.1262, "num_tokens": 898448972.0, "reward": 1.3766741752624512, "reward_std": 0.396701455116272, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.2396480143070221, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1009.310302734375, "completions/mean_terminated_length": 807.1119995117188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3256086516434926, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13397808919569004, "kl": 0.021820068359375, "learning_rate": 8.679020198898654e-07, "loss": 0.076, "num_tokens": 898969735.0, "reward": 1.4804688692092896, "reward_std": 0.30424928665161133, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19111628830432892, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1095.3148193359375, "completions/mean_terminated_length": 915.8965454101562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3258217463107986, "frac_reward_zero_std": 0.0, "grad_norm": 0.12875118069556546, "kl": 0.0185394287109375, "learning_rate": 8.676649966360714e-07, "loss": 0.0822, "num_tokens": 899524980.0, "reward": 1.5078126192092896, "reward_std": 0.32376590371131897, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20024341344833374, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 963.2567138671875, "completions/mean_terminated_length": 785.7532348632812, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.32603484097810453, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11945135308737737, "kl": 0.019805908203125, "learning_rate": 8.67427797579017e-07, "loss": 0.0858, "num_tokens": 900030551.0, "reward": 1.4944196939468384, "reward_std": 0.27198731899261475, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.15819832682609558, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1041.825927734375, "completions/mean_terminated_length": 826.4119262695312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3262479356454105, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11744061159742629, "kl": 0.019317626953125, "learning_rate": 8.671904228499737e-07, "loss": 0.0359, "num_tokens": 900568249.0, "reward": 1.4804688692092896, "reward_std": 0.30808940529823303, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18297357857227325, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1057.247802734375, "completions/mean_terminated_length": 801.210693359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.32646103031271645, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12485111826515224, "kl": 0.017364501953125, "learning_rate": 8.669528725803102e-07, "loss": 0.0855, "num_tokens": 901111544.0, "reward": 1.4001116752624512, "reward_std": 0.37890103459358215, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.21437901258468628, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 950.7701416015625, "completions/mean_terminated_length": 740.6622314453125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.32667412498002235, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1724657807321857, "kl": 0.02105712890625, "learning_rate": 8.667151469014923e-07, "loss": 0.1136, "num_tokens": 901601505.0, "reward": 1.5976563692092896, "reward_std": 0.34070268273353577, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18863096833229065, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1034.852783203125, "completions/mean_terminated_length": 811.2424926757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3268872196473283, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15902213604534607, "kl": 0.018341064453125, "learning_rate": 8.664772459450831e-07, "loss": 0.0698, "num_tokens": 902135023.0, "reward": 1.4157366752624512, "reward_std": 0.34358060359954834, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.1962660849094391, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1070.6898193359375, "completions/mean_terminated_length": 838.5111083984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32710031431463427, "frac_reward_zero_std": 0.0, "grad_norm": 0.1233316798091126, "kl": 0.018157958984375, "learning_rate": 8.662391698427426e-07, "loss": 0.1105, "num_tokens": 902679860.0, "reward": 1.4068081378936768, "reward_std": 0.38109514117240906, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.23402369022369385, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1028.4241943359375, "completions/mean_terminated_length": 813.4865112304688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3273134089819402, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13722982106341639, "kl": 0.0196533203125, "learning_rate": 8.660009187262277e-07, "loss": 0.0712, "num_tokens": 903205618.0, "reward": 1.5412946939468384, "reward_std": 0.2835904359817505, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17065013945102692, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1024.375, "completions/mean_terminated_length": 818.5523071289062, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3275265036492462, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14239299096983507, "kl": 0.019989013671875, "learning_rate": 8.657624927273919e-07, "loss": 0.0839, "num_tokens": 903730394.0, "reward": 1.3883929252624512, "reward_std": 0.2794315218925476, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19922147691249847, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 950.950927734375, "completions/mean_terminated_length": 774.7409057617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32773959831655214, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12586889278687977, "kl": 0.019439697265625, "learning_rate": 8.65523891978186e-07, "loss": 0.0559, "num_tokens": 904232180.0, "reward": 1.5345982313156128, "reward_std": 0.3146071434020996, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17708362638950348, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1199.015625, "completions/mean_terminated_length": 909.2425537109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3279526929838581, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12037489857620279, "kl": 0.016448974609375, "learning_rate": 8.652851166106573e-07, "loss": 0.11, "num_tokens": 904840571.0, "reward": 1.3950893878936768, "reward_std": 0.38501065969467163, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.22758229076862335, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1155.6920166015625, "completions/mean_terminated_length": 899.2816162109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32816578765116405, "frac_reward_zero_std": 0.0, "grad_norm": 0.125584265905455, "kl": 0.017059326171875, "learning_rate": 8.650461667569495e-07, "loss": 0.1083, "num_tokens": 905427617.0, "reward": 1.4481027126312256, "reward_std": 0.4261804223060608, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.22890526056289673, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1157.665283203125, "completions/mean_terminated_length": 924.4224853515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32837888231846996, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11284061539316523, "kl": 0.01806640625, "learning_rate": 8.648070425493031e-07, "loss": 0.0466, "num_tokens": 906014939.0, "reward": 1.5022321939468384, "reward_std": 0.35935840010643005, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19424648582935333, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1055.7545166015625, "completions/mean_terminated_length": 840.0489501953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3285919769857759, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13001606876846988, "kl": 0.017578125, "learning_rate": 8.645677441200551e-07, "loss": 0.0696, "num_tokens": 906560957.0, "reward": 1.3738839626312256, "reward_std": 0.2935698628425598, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17708362638950348, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1166.296875, "completions/mean_terminated_length": 922.6353149414062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.32880507165308187, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1047703457229358, "kl": 0.017791748046875, "learning_rate": 8.643282716016388e-07, "loss": 0.0467, "num_tokens": 907152178.0, "reward": 1.422991156578064, "reward_std": 0.33721673488616943, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20895659923553467, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1134.5960693359375, "completions/mean_terminated_length": 929.9535522460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32901816632038783, "frac_reward_zero_std": 0.0, "grad_norm": 0.11804426593857516, "kl": 0.019012451171875, "learning_rate": 8.640886251265839e-07, "loss": 0.0087, "num_tokens": 907730077.0, "reward": 1.5083706378936768, "reward_std": 0.3407495617866516, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18076327443122864, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1204.4263916015625, "completions/mean_terminated_length": 977.4022827148438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3292312609876938, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10595729823429317, "kl": 0.0156097412109375, "learning_rate": 8.638488048275166e-07, "loss": 0.0311, "num_tokens": 908342508.0, "reward": 1.426897406578064, "reward_std": 0.34525033831596375, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.19482967257499695, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1026.997802734375, "completions/mean_terminated_length": 821.7024536132812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.32944435565499974, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12195332760302954, "kl": 0.017730712890625, "learning_rate": 8.636088108371588e-07, "loss": 0.1093, "num_tokens": 908870347.0, "reward": 1.587053656578064, "reward_std": 0.3584185242652893, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18620048463344574, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 906.904052734375, "completions/mean_terminated_length": 753.794921875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3296574503223057, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1307723429054731, "kl": 0.020294189453125, "learning_rate": 8.633686432883289e-07, "loss": 0.091, "num_tokens": 909347008.0, "reward": 1.6238839626312256, "reward_std": 0.26169130206108093, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16231150925159454, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1108.0023193359375, "completions/mean_terminated_length": 848.2307739257812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.32987054498961166, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13747905334302876, "kl": 0.018402099609375, "learning_rate": 8.631283023139413e-07, "loss": 0.0845, "num_tokens": 909908129.0, "reward": 1.4587054252624512, "reward_std": 0.3720957338809967, "rewards/accuracy_reward/mean": 0.5462962985038757, "rewards/accuracy_reward/std": 0.49842923879623413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.1945772022008896, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1013.4375610351562, "completions/mean_terminated_length": 795.340576171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33008363965691756, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1393923780844507, "kl": 0.019775390625, "learning_rate": 8.628877880470062e-07, "loss": 0.1057, "num_tokens": 910423925.0, "reward": 1.5329241752624512, "reward_std": 0.3651920258998871, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.21694914996623993, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1132.1942138671875, "completions/mean_terminated_length": 785.596923828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3302967343242235, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1101738653267016, "kl": 0.015960693359375, "learning_rate": 8.6264710062063e-07, "loss": 0.0912, "num_tokens": 911001276.0, "reward": 1.3270089626312256, "reward_std": 0.3444467782974243, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.2043900191783905, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1040.6607666015625, "completions/mean_terminated_length": 841.3475952148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3305098289915295, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11522031122400136, "kl": 0.01885986328125, "learning_rate": 8.624062401680148e-07, "loss": 0.0973, "num_tokens": 911540996.0, "reward": 1.5842634439468384, "reward_std": 0.38823845982551575, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17547869682312012, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1038.415283203125, "completions/mean_terminated_length": 828.878662109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33072292365883543, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13906163516479314, "kl": 0.016845703125, "learning_rate": 8.621652068224582e-07, "loss": 0.0784, "num_tokens": 912079230.0, "reward": 1.3945313692092896, "reward_std": 0.3579113185405731, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20285958051681519, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1086.305908203125, "completions/mean_terminated_length": 830.940673828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3309360183261414, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.132157395439325, "kl": 0.018096923828125, "learning_rate": 8.619240007173541e-07, "loss": 0.0815, "num_tokens": 912637815.0, "reward": 1.4397321939468384, "reward_std": 0.3675016164779663, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.2259417027235031, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1042.571533203125, "completions/mean_terminated_length": 868.858642578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33114911299344735, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12823196655768887, "kl": 0.019683837890625, "learning_rate": 8.61682621986191e-07, "loss": 0.1226, "num_tokens": 913173367.0, "reward": 1.5022321939468384, "reward_std": 0.3428337275981903, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.19624143838882446, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1011.5357666015625, "completions/mean_terminated_length": 772.3516845703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3313622076607533, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11278272137202248, "kl": 0.01922607421875, "learning_rate": 8.61441070762554e-07, "loss": 0.0378, "num_tokens": 913690855.0, "reward": 1.4001116752624512, "reward_std": 0.25777122378349304, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21654021739959717, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1036.6629638671875, "completions/mean_terminated_length": 858.8162841796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.33157530232805926, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11554814236829952, "kl": 0.0181884765625, "learning_rate": 8.611993471801232e-07, "loss": 0.0325, "num_tokens": 914225344.0, "reward": 1.3521206378936768, "reward_std": 0.2653568983078003, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.16564138233661652, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 919.8906860351562, "completions/mean_terminated_length": 710.9814453125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.33178839699536516, "frac_reward_zero_std": 0.0, "grad_norm": 0.14870361635136725, "kl": 0.019561767578125, "learning_rate": 8.609574513726739e-07, "loss": 0.1583, "num_tokens": 914703423.0, "reward": 1.3939732313156128, "reward_std": 0.33611148595809937, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.21293358504772186, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1093.868408203125, "completions/mean_terminated_length": 775.8244018554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3320014916626711, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1296739443720335, "kl": 0.0181884765625, "learning_rate": 8.607153834740771e-07, "loss": 0.0868, "num_tokens": 915264212.0, "reward": 1.3967634439468384, "reward_std": 0.37327054142951965, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.21386010944843292, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1032.743408203125, "completions/mean_terminated_length": 866.6103515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3322145863299771, "frac_reward_zero_std": 0.0, "grad_norm": 0.13803957140549328, "kl": 0.018585205078125, "learning_rate": 8.604731436182988e-07, "loss": 0.0823, "num_tokens": 915794305.0, "reward": 1.4949777126312256, "reward_std": 0.34452149271965027, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18260475993156433, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1014.08935546875, "completions/mean_terminated_length": 825.8575439453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33242768099728304, "frac_reward_zero_std": 0.0, "grad_norm": 0.12731579006375046, "kl": 0.020782470703125, "learning_rate": 8.602307319394001e-07, "loss": 0.1107, "num_tokens": 916315113.0, "reward": 1.4949777126312256, "reward_std": 0.3242989778518677, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18639397621154785, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 988.79248046875, "completions/mean_terminated_length": 802.5275268554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.332640775664589, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13914407672937937, "kl": 0.018951416015625, "learning_rate": 8.599881485715374e-07, "loss": 0.0934, "num_tokens": 916829132.0, "reward": 1.5290179252624512, "reward_std": 0.3423592448234558, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18086770176887512, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1046.497802734375, "completions/mean_terminated_length": 798.2144775390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33285387033189495, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11161299470516874, "kl": 0.018096923828125, "learning_rate": 8.597453936489623e-07, "loss": 0.077, "num_tokens": 917367595.0, "reward": 1.4849331378936768, "reward_std": 0.33410611748695374, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19267114996910095, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1030.1273193359375, "completions/mean_terminated_length": 795.2335205078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3330669649992009, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13171322853169354, "kl": 0.01971435546875, "learning_rate": 8.595024673060204e-07, "loss": 0.1117, "num_tokens": 917895044.0, "reward": 1.5156251192092896, "reward_std": 0.3379441797733307, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.20910291373729706, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 971.24560546875, "completions/mean_terminated_length": 817.4234619140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33328005966650687, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12838801195280714, "kl": 0.020477294921875, "learning_rate": 8.592593696771538e-07, "loss": 0.0671, "num_tokens": 918398610.0, "reward": 1.5552456378936768, "reward_std": 0.3218420147895813, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17538617551326752, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1021.1094360351562, "completions/mean_terminated_length": 797.872314453125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.33349315433381277, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1198834553972838, "kl": 0.021087646484375, "learning_rate": 8.590161008968975e-07, "loss": 0.0434, "num_tokens": 918919443.0, "reward": 1.5468751192092896, "reward_std": 0.29237571358680725, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.16445480287075043, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1155.638427734375, "completions/mean_terminated_length": 934.4122314453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3337062490011187, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11622894785731466, "kl": 0.0168304443359375, "learning_rate": 8.587726610998824e-07, "loss": 0.0745, "num_tokens": 919513489.0, "reward": 1.3666294813156128, "reward_std": 0.32794949412345886, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9135044813156128, "rewards/tag_count_reward/std": 0.2247832715511322, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1216.8773193359375, "completions/mean_terminated_length": 862.1942749023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3339193436684247, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12281726539302042, "kl": 0.016082763671875, "learning_rate": 8.585290504208341e-07, "loss": 0.0703, "num_tokens": 920124906.0, "reward": 1.2712054252624512, "reward_std": 0.33053600788116455, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.21358920633792877, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 954.0960083007812, "completions/mean_terminated_length": 730.6102294921875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.33413243833573064, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13504144612283336, "kl": 0.019683837890625, "learning_rate": 8.582852689945722e-07, "loss": 0.0947, "num_tokens": 920622165.0, "reward": 1.551897406578064, "reward_std": 0.29022881388664246, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.17221127450466156, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1005.6094360351562, "completions/mean_terminated_length": 865.7443237304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3343455330030366, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13380214733181806, "kl": 0.019317626953125, "learning_rate": 8.580413169560112e-07, "loss": 0.1117, "num_tokens": 921135782.0, "reward": 1.5680804252624512, "reward_std": 0.30227774381637573, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17561113834381104, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1239.716552734375, "completions/mean_terminated_length": 940.626953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33455862767034256, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12449077140104582, "kl": 0.014678955078125, "learning_rate": 8.577971944401598e-07, "loss": 0.1119, "num_tokens": 921755911.0, "reward": 1.3415179252624512, "reward_std": 0.38318243622779846, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8861607313156128, "rewards/tag_count_reward/std": 0.26522624492645264, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1074.134033203125, "completions/mean_terminated_length": 852.6795043945312, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.3347717223376485, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14544190800650036, "kl": 0.016754150390625, "learning_rate": 8.575529015821212e-07, "loss": 0.1399, "num_tokens": 922310067.0, "reward": 1.395647406578064, "reward_std": 0.3073558807373047, "rewards/accuracy_reward/mean": 0.48379629850387573, "rewards/accuracy_reward/std": 0.5003167986869812, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.2196313887834549, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1033.024658203125, "completions/mean_terminated_length": 791.8978271484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33498481700495447, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.133962976415893, "kl": 0.020050048828125, "learning_rate": 8.573084385170927e-07, "loss": 0.0712, "num_tokens": 922847822.0, "reward": 1.563616156578064, "reward_std": 0.34568989276885986, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18280036747455597, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1098.3013916015625, "completions/mean_terminated_length": 907.3432006835938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.33519791167226043, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13238663547720864, "kl": 0.01739501953125, "learning_rate": 8.570638053803659e-07, "loss": 0.0763, "num_tokens": 923410213.0, "reward": 1.4631696939468384, "reward_std": 0.3122788965702057, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.17409753799438477, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1031.63623046875, "completions/mean_terminated_length": 830.5374755859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33541100633956633, "frac_reward_zero_std": 0.0, "grad_norm": 0.1452290616321746, "kl": 0.019439697265625, "learning_rate": 8.568190023073265e-07, "loss": 0.0984, "num_tokens": 923938002.0, "reward": 1.4960938692092896, "reward_std": 0.3545323610305786, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9291294813156128, "rewards/tag_count_reward/std": 0.20514829456806183, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1081.075927734375, "completions/mean_terminated_length": 908.0474243164062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3356241010068723, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11712048381827228, "kl": 0.017974853515625, "learning_rate": 8.565740294334544e-07, "loss": 0.0641, "num_tokens": 924492500.0, "reward": 1.5312501192092896, "reward_std": 0.2865827977657318, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1884397715330124, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1043.075927734375, "completions/mean_terminated_length": 850.6436157226562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.33583719567417825, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13070003789527895, "kl": 0.020538330078125, "learning_rate": 8.563288868943232e-07, "loss": 0.0453, "num_tokens": 925038982.0, "reward": 1.5379464626312256, "reward_std": 0.3534909188747406, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.1970413774251938, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1010.8750610351562, "completions/mean_terminated_length": 850.4948120117188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3360502903414842, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12501885986782266, "kl": 0.018585205078125, "learning_rate": 8.560835748256007e-07, "loss": 0.0981, "num_tokens": 925561102.0, "reward": 1.598772406578064, "reward_std": 0.32354316115379333, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1787492334842682, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1033.4375, "completions/mean_terminated_length": 799.3077392578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33626338500879016, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11169122723076931, "kl": 0.01708984375, "learning_rate": 8.55838093363048e-07, "loss": 0.0422, "num_tokens": 926096178.0, "reward": 1.4609376192092896, "reward_std": 0.34643271565437317, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.19318637251853943, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1138.977783203125, "completions/mean_terminated_length": 900.8394165039062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3364764796760961, "frac_reward_zero_std": 0.0, "grad_norm": 0.11359512424902131, "kl": 0.0154266357421875, "learning_rate": 8.555924426425209e-07, "loss": 0.0612, "num_tokens": 926674664.0, "reward": 1.481584906578064, "reward_std": 0.3570943772792816, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.1951945573091507, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1124.700927734375, "completions/mean_terminated_length": 911.6318969726562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3366895743434021, "frac_reward_zero_std": 0.0, "grad_norm": 0.12104951916944742, "kl": 0.017547607421875, "learning_rate": 8.553466227999675e-07, "loss": 0.0544, "num_tokens": 927253618.0, "reward": 1.4587054252624512, "reward_std": 0.3643728792667389, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.20681875944137573, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1041.7054443359375, "completions/mean_terminated_length": 855.3544921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.33690266901070803, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12576589918029712, "kl": 0.020172119140625, "learning_rate": 8.551006339714308e-07, "loss": 0.0497, "num_tokens": 927787726.0, "reward": 1.5496652126312256, "reward_std": 0.34323054552078247, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295229911804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.18222837150096893, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 976.0826416015625, "completions/mean_terminated_length": 803.9093017578125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.33711576367801394, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.131544968487818, "kl": 0.01971435546875, "learning_rate": 8.548544762930469e-07, "loss": 0.1391, "num_tokens": 928285747.0, "reward": 1.4140626192092896, "reward_std": 0.35008323192596436, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20490244030952454, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1060.6763916015625, "completions/mean_terminated_length": 826.1188354492188, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3373288583453199, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1192012317907316, "kl": 0.01849365234375, "learning_rate": 8.546081499010449e-07, "loss": 0.0902, "num_tokens": 928825474.0, "reward": 1.4977679252624512, "reward_std": 0.366095632314682, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20075708627700806, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1057.930908203125, "completions/mean_terminated_length": 862.0347900390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33754195301262585, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10822808637642121, "kl": 0.017669677734375, "learning_rate": 8.543616549317475e-07, "loss": 0.0508, "num_tokens": 929366435.0, "reward": 1.4291294813156128, "reward_std": 0.2421514391899109, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.16769373416900635, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1055.171875, "completions/mean_terminated_length": 842.6151733398438, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3377550476799318, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12156121799100028, "kl": 0.01904296875, "learning_rate": 8.541149915215711e-07, "loss": 0.061, "num_tokens": 929906784.0, "reward": 1.571428656578064, "reward_std": 0.30008241534233093, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.20923420786857605, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1114.7991943359375, "completions/mean_terminated_length": 873.6348266601562, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.33796814234723777, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12698244792293686, "kl": 0.018463134765625, "learning_rate": 8.538681598070248e-07, "loss": 0.0797, "num_tokens": 930476662.0, "reward": 1.5106027126312256, "reward_std": 0.32206472754478455, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.20484988391399384, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1074.54248046875, "completions/mean_terminated_length": 878.8070068359375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3381812370145437, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.234242915234687, "kl": 0.0255126953125, "learning_rate": 8.536211599247114e-07, "loss": 0.1117, "num_tokens": 931036441.0, "reward": 1.3454241752624512, "reward_std": 0.2859877347946167, "rewards/accuracy_reward/mean": 0.43518519401550293, "rewards/accuracy_reward/std": 0.4963560700416565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2126835286617279, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1158.9085693359375, "completions/mean_terminated_length": 913.2051391601562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3383943316818497, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12134259593881014, "kl": 0.0159912109375, "learning_rate": 8.53373992011326e-07, "loss": 0.08, "num_tokens": 931617584.0, "reward": 1.3861607313156128, "reward_std": 0.3382651209831238, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.21056649088859558, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1014.4241333007812, "completions/mean_terminated_length": 803.263427734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.33860742634915564, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.9260785552464367, "kl": 0.124237060546875, "learning_rate": 8.531266562036576e-07, "loss": 0.0619, "num_tokens": 932150094.0, "reward": 1.4888393878936768, "reward_std": 0.3059941530227661, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.194285050034523, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1115.732177734375, "completions/mean_terminated_length": 900.5934448242188, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.33882052101646154, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11824290626565354, "kl": 0.01837158203125, "learning_rate": 8.528791526385871e-07, "loss": 0.0451, "num_tokens": 932719062.0, "reward": 1.4648438692092896, "reward_std": 0.2916608452796936, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19759102165699005, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1055.044677734375, "completions/mean_terminated_length": 883.4869384765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3390336156837675, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12374272301341897, "kl": 0.01995849609375, "learning_rate": 8.526314814530892e-07, "loss": 0.0726, "num_tokens": 933263114.0, "reward": 1.4994419813156128, "reward_std": 0.2921116054058075, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17547869682312012, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1082.8013916015625, "completions/mean_terminated_length": 866.5546264648438, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.33924671035107345, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.34648282663667723, "kl": 0.03204345703125, "learning_rate": 8.523836427842306e-07, "loss": 0.1089, "num_tokens": 933821537.0, "reward": 1.4598214626312256, "reward_std": 0.33889105916023254, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18674945831298828, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1096.1607666015625, "completions/mean_terminated_length": 907.8289184570312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3394598050183794, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.332481468693029, "kl": 0.02239990234375, "learning_rate": 8.521356367691713e-07, "loss": 0.0718, "num_tokens": 934379849.0, "reward": 1.4843751192092896, "reward_std": 0.3701212704181671, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.19500340521335602, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 951.0826416015625, "completions/mean_terminated_length": 771.5869750976562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.33967289968568537, "frac_reward_zero_std": 0.0, "grad_norm": 0.12518180098565113, "kl": 0.018768310546875, "learning_rate": 8.518874635451635e-07, "loss": 0.0514, "num_tokens": 934871598.0, "reward": 1.6171876192092896, "reward_std": 0.3468630909919739, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1567714661359787, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1188.2679443359375, "completions/mean_terminated_length": 915.176513671875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3398859943529913, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14599288429524476, "kl": 0.017181396484375, "learning_rate": 8.516391232495522e-07, "loss": 0.0933, "num_tokens": 935476070.0, "reward": 1.2873884439468384, "reward_std": 0.36599746346473694, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.24354930222034454, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1187.5045166015625, "completions/mean_terminated_length": 940.2356567382812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3400990890202973, "frac_reward_zero_std": 0.0, "grad_norm": 0.12111080847835187, "kl": 0.0151824951171875, "learning_rate": 8.513906160197748e-07, "loss": 0.1035, "num_tokens": 936080808.0, "reward": 1.328125, "reward_std": 0.35253384709358215, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.4903542101383209, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.2222987711429596, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1010.2410888671875, "completions/mean_terminated_length": 843.5543823242188, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.34031218368760324, "frac_reward_zero_std": 0.0, "grad_norm": 0.13419886506615158, "kl": 0.017974853515625, "learning_rate": 8.511419419933606e-07, "loss": 0.1251, "num_tokens": 936596660.0, "reward": 1.5368304252624512, "reward_std": 0.3319348990917206, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17943671345710754, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1148.415283203125, "completions/mean_terminated_length": 928.5166625976562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.34052527835490914, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10581830156494289, "kl": 0.0153656005859375, "learning_rate": 8.508931013079322e-07, "loss": 0.042, "num_tokens": 937177262.0, "reward": 1.344866156578064, "reward_std": 0.37663909792900085, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134778022766, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.20580217242240906, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1068.32373046875, "completions/mean_terminated_length": 835.5828857421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3407383730222151, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10760653346547906, "kl": 0.016357421875, "learning_rate": 8.506440941012037e-07, "loss": 0.0583, "num_tokens": 937733695.0, "reward": 1.5139509439468384, "reward_std": 0.33474764227867126, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.1653849333524704, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 978.76123046875, "completions/mean_terminated_length": 770.615966796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34095146768952106, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12376230938122178, "kl": 0.0196533203125, "learning_rate": 8.503949205109813e-07, "loss": 0.0755, "num_tokens": 938238756.0, "reward": 1.501116156578064, "reward_std": 0.342297226190567, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19029554724693298, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 995.888427734375, "completions/mean_terminated_length": 787.7166137695312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.341164562356827, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12540608643126913, "kl": 0.021453857421875, "learning_rate": 8.501455806751638e-07, "loss": 0.0431, "num_tokens": 938749938.0, "reward": 1.544084906578064, "reward_std": 0.36465775966644287, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.18413659930229187, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1142.1585693359375, "completions/mean_terminated_length": 878.49853515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.341377657024133, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12874172540854856, "kl": 0.016998291015625, "learning_rate": 8.498960747317416e-07, "loss": 0.0915, "num_tokens": 939326281.0, "reward": 1.325334906578064, "reward_std": 0.30375322699546814, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18336889147758484, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1016.4107666015625, "completions/mean_terminated_length": 853.808837890625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.34159075169143893, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12827477825000136, "kl": 0.01812744140625, "learning_rate": 8.496464028187969e-07, "loss": 0.0836, "num_tokens": 939854353.0, "reward": 1.4743304252624512, "reward_std": 0.3306964635848999, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.20006877183914185, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1177.107177734375, "completions/mean_terminated_length": 900.4705810546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3418038463587449, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11432165066473485, "kl": 0.0154876708984375, "learning_rate": 8.493965650745043e-07, "loss": 0.0696, "num_tokens": 940446081.0, "reward": 1.3303571939468384, "reward_std": 0.34219592809677124, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.1949649602174759, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1133.685302734375, "completions/mean_terminated_length": 922.6895751953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.34201694102605085, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11167500738734885, "kl": 0.0158538818359375, "learning_rate": 8.491465616371299e-07, "loss": 0.0726, "num_tokens": 941029508.0, "reward": 1.4704241752624512, "reward_std": 0.3550918400287628, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.17041130363941193, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 999.4464721679688, "completions/mean_terminated_length": 798.6595458984375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.34223003569335675, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12552361769465137, "kl": 0.01715087890625, "learning_rate": 8.488963926450313e-07, "loss": 0.0742, "num_tokens": 941550700.0, "reward": 1.5630581378936768, "reward_std": 0.33897992968559265, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.18274828791618347, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1021.3817138671875, "completions/mean_terminated_length": 821.5333251953125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3424431303606627, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11984445150945572, "kl": 0.017578125, "learning_rate": 8.48646058236658e-07, "loss": 0.0874, "num_tokens": 942085079.0, "reward": 1.5736607313156128, "reward_std": 0.23639696836471558, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15519075095653534, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 940.0178833007812, "completions/mean_terminated_length": 758.711669921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.34265622502796866, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395942109562022, "kl": 0.02020263671875, "learning_rate": 8.483955585505507e-07, "loss": 0.1327, "num_tokens": 942573631.0, "reward": 1.5725446939468384, "reward_std": 0.3420998752117157, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18730680644512177, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1209.430908203125, "completions/mean_terminated_length": 933.2255249023438, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3428693196952746, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11163256036739098, "kl": 0.0169219970703125, "learning_rate": 8.48144893725342e-07, "loss": 0.0921, "num_tokens": 943187008.0, "reward": 1.450334906578064, "reward_std": 0.356228232383728, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.2339489907026291, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1088.6942138671875, "completions/mean_terminated_length": 833.9632568359375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3430824143625806, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12462191398307557, "kl": 0.0165863037109375, "learning_rate": 8.478940638997558e-07, "loss": 0.0651, "num_tokens": 943746711.0, "reward": 1.3945313692092896, "reward_std": 0.2974564731121063, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1695888340473175, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1024.5023193359375, "completions/mean_terminated_length": 825.2613525390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.34329550902988654, "frac_reward_zero_std": 0.0, "grad_norm": 0.14180287976384584, "kl": 0.018768310546875, "learning_rate": 8.47643069212607e-07, "loss": 0.129, "num_tokens": 944280568.0, "reward": 1.4592634439468384, "reward_std": 0.4094885587692261, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1929948478937149, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1156.529052734375, "completions/mean_terminated_length": 913.4005737304688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3435086036971925, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11693392577947447, "kl": 0.016357421875, "learning_rate": 8.473919098028021e-07, "loss": 0.0557, "num_tokens": 944869189.0, "reward": 1.3415179252624512, "reward_std": 0.2694803476333618, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.17691786587238312, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1170.77685546875, "completions/mean_terminated_length": 902.2390747070312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34372169836449845, "frac_reward_zero_std": 0.0, "grad_norm": 0.12202034254249197, "kl": 0.0159454345703125, "learning_rate": 8.471405858093385e-07, "loss": 0.1094, "num_tokens": 945461777.0, "reward": 1.3777902126312256, "reward_std": 0.3514552414417267, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9224330186843872, "rewards/tag_count_reward/std": 0.21863438189029694, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 966.763427734375, "completions/mean_terminated_length": 786.5573120117188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.34393479303180435, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13679831943531562, "kl": 0.01873779296875, "learning_rate": 8.468890973713048e-07, "loss": 0.0488, "num_tokens": 945960535.0, "reward": 1.469866156578064, "reward_std": 0.3222181499004364, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.1764904409646988, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1059.203125, "completions/mean_terminated_length": 860.3834228515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3441478876991103, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12369640650993691, "kl": 0.0165557861328125, "learning_rate": 8.466374446278806e-07, "loss": 0.0944, "num_tokens": 946501458.0, "reward": 1.4720982313156128, "reward_std": 0.29216450452804565, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.165329247713089, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1114.435302734375, "completions/mean_terminated_length": 882.994384765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34436098236641627, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12614838333385603, "kl": 0.018585205078125, "learning_rate": 8.463856277183366e-07, "loss": 0.0976, "num_tokens": 947062917.0, "reward": 1.4877232313156128, "reward_std": 0.3302699625492096, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20703594386577606, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1202.9598388671875, "completions/mean_terminated_length": 900.7938842773438, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3445740770337222, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10659987049189806, "kl": 0.016021728515625, "learning_rate": 8.461336467820339e-07, "loss": 0.044, "num_tokens": 947669523.0, "reward": 1.3582589626312256, "reward_std": 0.36030831933021545, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.20681875944137573, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1181.1295166015625, "completions/mean_terminated_length": 915.760986328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3447871717010282, "frac_reward_zero_std": 0.0, "grad_norm": 0.11785801678981878, "kl": 0.016998291015625, "learning_rate": 8.458815019584247e-07, "loss": 0.0947, "num_tokens": 948272365.0, "reward": 1.3247768878936768, "reward_std": 0.3536304831504822, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.2192423790693283, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1076.9285888671875, "completions/mean_terminated_length": 865.8261108398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.34500026636833414, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.31550087920220626, "kl": 0.027740478515625, "learning_rate": 8.456291933870521e-07, "loss": 0.088, "num_tokens": 948830349.0, "reward": 1.3984376192092896, "reward_std": 0.31362342834472656, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18730680644512177, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 974.622802734375, "completions/mean_terminated_length": 818.145751953125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3452133610356401, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12709461279539422, "kl": 0.018341064453125, "learning_rate": 8.453767212075495e-07, "loss": 0.0471, "num_tokens": 949327844.0, "reward": 1.5719866752624512, "reward_std": 0.3165784478187561, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17984239757061005, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1078.185302734375, "completions/mean_terminated_length": 841.1194458007812, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.34542645570294606, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11790167650458931, "kl": 0.01812744140625, "learning_rate": 8.451240855596409e-07, "loss": 0.0298, "num_tokens": 949876791.0, "reward": 1.4603794813156128, "reward_std": 0.32075875997543335, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20976383984088898, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1006.0178833007812, "completions/mean_terminated_length": 765.5604858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.34563955037025196, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13012699424904645, "kl": 0.01873779296875, "learning_rate": 8.448712865831405e-07, "loss": 0.0877, "num_tokens": 950393359.0, "reward": 1.4559152126312256, "reward_std": 0.2959991991519928, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.19338902831077576, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 972.1116333007812, "completions/mean_terminated_length": 812.1077270507812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3458526450375579, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11659833126967319, "kl": 0.01934814453125, "learning_rate": 8.446183244179537e-07, "loss": 0.0722, "num_tokens": 950897457.0, "reward": 1.5580357313156128, "reward_std": 0.3262375593185425, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.16415086388587952, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1201.3795166015625, "completions/mean_terminated_length": 964.32568359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3460657397048639, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11849761053249776, "kl": 0.0146636962890625, "learning_rate": 8.443651992040754e-07, "loss": 0.0797, "num_tokens": 951507995.0, "reward": 1.4095982313156128, "reward_std": 0.33879438042640686, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.1686781942844391, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 994.7879638671875, "completions/mean_terminated_length": 838.1564331054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.34627883437216983, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11993875597034388, "kl": 0.01995849609375, "learning_rate": 8.441119110815911e-07, "loss": 0.0465, "num_tokens": 952027740.0, "reward": 1.4531251192092896, "reward_std": 0.3466882109642029, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18240725994110107, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 964.8616333007812, "completions/mean_terminated_length": 794.1343994140625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3464919290394758, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14036370193297334, "kl": 0.02081298828125, "learning_rate": 8.438584601906763e-07, "loss": 0.0972, "num_tokens": 952533726.0, "reward": 1.4905134439468384, "reward_std": 0.27920711040496826, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.17873525619506836, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 987.6629638671875, "completions/mean_terminated_length": 807.710205078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34670502370678175, "frac_reward_zero_std": 0.0, "grad_norm": 0.16755385757276825, "kl": 0.023956298828125, "learning_rate": 8.436048466715968e-07, "loss": 0.087, "num_tokens": 953049479.0, "reward": 1.5234376192092896, "reward_std": 0.3229178786277771, "rewards/accuracy_reward/mean": 0.6087962985038757, "rewards/accuracy_reward/std": 0.4885856807231903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19961901009082794, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 992.9688110351562, "completions/mean_terminated_length": 817.1302490234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3469181183740877, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13232075696645468, "kl": 0.021514892578125, "learning_rate": 8.433510706647082e-07, "loss": 0.0593, "num_tokens": 953555241.0, "reward": 1.438616156578064, "reward_std": 0.32919150590896606, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.1796870082616806, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1112.4107666015625, "completions/mean_terminated_length": 833.0899047851562, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.34713121304139366, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1288337679320564, "kl": 0.0172119140625, "learning_rate": 8.43097132310456e-07, "loss": 0.1148, "num_tokens": 954124049.0, "reward": 1.4252232313156128, "reward_std": 0.39024457335472107, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9073660969734192, "rewards/tag_count_reward/std": 0.2436242550611496, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 940.5670166015625, "completions/mean_terminated_length": 788.7868041992188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34734430770869956, "frac_reward_zero_std": 0.0, "grad_norm": 0.14651077284284583, "kl": 0.02044677734375, "learning_rate": 8.428430317493758e-07, "loss": 0.1327, "num_tokens": 954616527.0, "reward": 1.4955357313156128, "reward_std": 0.3442634642124176, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.18987849354743958, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1108.390625, "completions/mean_terminated_length": 888.3718872070312, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3475574023760055, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.114830226473819, "kl": 0.0158538818359375, "learning_rate": 8.425887691220927e-07, "loss": 0.0743, "num_tokens": 955185582.0, "reward": 1.524553656578064, "reward_std": 0.38950827717781067, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17145662009716034, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 979.99560546875, "completions/mean_terminated_length": 722.609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3477704970433115, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12826343656744987, "kl": 0.021820068359375, "learning_rate": 8.423343445693217e-07, "loss": 0.0996, "num_tokens": 955694972.0, "reward": 1.594866156578064, "reward_std": 0.3238920569419861, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1879189908504486, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1118.935302734375, "completions/mean_terminated_length": 907.6685180664062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.34798359171061743, "frac_reward_zero_std": 0.0, "grad_norm": 0.12611364433179229, "kl": 0.017791748046875, "learning_rate": 8.420797582318672e-07, "loss": 0.0917, "num_tokens": 956278367.0, "reward": 1.4603794813156128, "reward_std": 0.30211108922958374, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.22847945988178253, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1056.9241943359375, "completions/mean_terminated_length": 828.2142944335938, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3481966863779234, "frac_reward_zero_std": 0.0, "grad_norm": 0.13416065233687174, "kl": 0.018402099609375, "learning_rate": 8.418250102506235e-07, "loss": 0.0843, "num_tokens": 956818829.0, "reward": 1.4408482313156128, "reward_std": 0.33602064847946167, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18046346306800842, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1151.872802734375, "completions/mean_terminated_length": 894.3649291992188, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.34840978104522935, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10417369791164842, "kl": 0.015472412109375, "learning_rate": 8.415701007665738e-07, "loss": 0.0473, "num_tokens": 957414372.0, "reward": 1.5368304252624512, "reward_std": 0.32307910919189453, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18642495572566986, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 974.3906860351562, "completions/mean_terminated_length": 817.8797607421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3486228757125353, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1435312812324333, "kl": 0.019500732421875, "learning_rate": 8.41315029920791e-07, "loss": 0.0753, "num_tokens": 957914643.0, "reward": 1.5033482313156128, "reward_std": 0.324045866727829, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1649361401796341, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1107.341552734375, "completions/mean_terminated_length": 893.4383544921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.34883597037984126, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11185849895635559, "kl": 0.01837158203125, "learning_rate": 8.410597978544375e-07, "loss": 0.1215, "num_tokens": 958481404.0, "reward": 1.4726563692092896, "reward_std": 0.35467612743377686, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.21838873624801636, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1000.310302734375, "completions/mean_terminated_length": 799.6887817382812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3490490650471472, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12394316537549191, "kl": 0.01983642578125, "learning_rate": 8.408044047087647e-07, "loss": 0.0865, "num_tokens": 958990087.0, "reward": 1.5195313692092896, "reward_std": 0.2774677574634552, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.16506759822368622, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1031.7991943359375, "completions/mean_terminated_length": 807.5149536132812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3492621597144531, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13214700511177013, "kl": 0.016876220703125, "learning_rate": 8.405488506251131e-07, "loss": 0.1135, "num_tokens": 959519341.0, "reward": 1.5000001192092896, "reward_std": 0.34166568517684937, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1928403526544571, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 956.6160888671875, "completions/mean_terminated_length": 781.3160400390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3494752543817591, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1455037968751677, "kl": 0.02197265625, "learning_rate": 8.402931357449121e-07, "loss": 0.0907, "num_tokens": 960015841.0, "reward": 1.4860491752624512, "reward_std": 0.29433223605155945, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.162506565451622, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 996.9888916015625, "completions/mean_terminated_length": 775.42431640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.34968834904906504, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12017265066066854, "kl": 0.016632080078125, "learning_rate": 8.400372602096807e-07, "loss": 0.067, "num_tokens": 960524332.0, "reward": 1.567522406578064, "reward_std": 0.3173326849937439, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15405942499637604, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1063.5023193359375, "completions/mean_terminated_length": 859.1724853515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.349901443716371, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12225836227539241, "kl": 0.018890380859375, "learning_rate": 8.397812241610261e-07, "loss": 0.0656, "num_tokens": 961069213.0, "reward": 1.4687501192092896, "reward_std": 0.3443000912666321, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.19134558737277985, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1042.122802734375, "completions/mean_terminated_length": 782.177001953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35011453838367695, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1278527087777152, "kl": 0.01812744140625, "learning_rate": 8.395250277406448e-07, "loss": 0.1023, "num_tokens": 961609684.0, "reward": 1.4095982313156128, "reward_std": 0.3001745641231537, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17418356239795685, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 957.08935546875, "completions/mean_terminated_length": 794.851318359375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3503276330509829, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12985906862358204, "kl": 0.018646240234375, "learning_rate": 8.392686710903221e-07, "loss": 0.0373, "num_tokens": 962107820.0, "reward": 1.5390626192092896, "reward_std": 0.24022845923900604, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14819122850894928, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1110.919677734375, "completions/mean_terminated_length": 802.2670288085938, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.35054072771828887, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.48295840111489974, "kl": 0.025054931640625, "learning_rate": 8.390121543519313e-07, "loss": 0.1008, "num_tokens": 962675768.0, "reward": 1.4681919813156128, "reward_std": 0.31022509932518005, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.17285525798797607, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 822.3683471679688, "completions/mean_terminated_length": 712.0316162109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3507538223855948, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1413350541777721, "kl": 0.020111083984375, "learning_rate": 8.387554776674352e-07, "loss": 0.1297, "num_tokens": 963114381.0, "reward": 1.7181921005249023, "reward_std": 0.3322814702987671, "rewards/accuracy_reward/mean": 0.7544642686843872, "rewards/accuracy_reward/std": 0.43088552355766296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.1416827142238617, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 971.44873046875, "completions/mean_terminated_length": 801.7597045898438, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.35096691705290073, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12906559349788407, "kl": 0.0170745849609375, "learning_rate": 8.384986411788846e-07, "loss": 0.0665, "num_tokens": 963621590.0, "reward": 1.5256696939468384, "reward_std": 0.32936400175094604, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.12144903838634491, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1183.0826416015625, "completions/mean_terminated_length": 950.314453125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3511800117202067, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11762234638480969, "kl": 0.014984130859375, "learning_rate": 8.382416450284186e-07, "loss": 0.0504, "num_tokens": 964220331.0, "reward": 1.4051339626312256, "reward_std": 0.3253902494907379, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17050378024578094, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1008.7098388671875, "completions/mean_terminated_length": 806.3946533203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.35139310638751264, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1324159762240552, "kl": 0.01788330078125, "learning_rate": 8.379844893582653e-07, "loss": 0.1185, "num_tokens": 964734841.0, "reward": 1.4001116752624512, "reward_std": 0.3016037940979004, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17976601421833038, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1136.5982666015625, "completions/mean_terminated_length": 861.0581665039062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3516062010548186, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11354181730812309, "kl": 0.0147705078125, "learning_rate": 8.377271743107403e-07, "loss": 0.0811, "num_tokens": 965325669.0, "reward": 1.4213169813156128, "reward_std": 0.33992189168930054, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18230371177196503, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 968.9620971679688, "completions/mean_terminated_length": 792.3922119140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35181929572212456, "frac_reward_zero_std": 0.0, "grad_norm": 0.2179572642136493, "kl": 0.02227783203125, "learning_rate": 8.374697000282481e-07, "loss": 0.065, "num_tokens": 965828292.0, "reward": 1.4095982313156128, "reward_std": 0.33641761541366577, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20094045996665955, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1018.8638916015625, "completions/mean_terminated_length": 808.6102294921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3520323903894305, "frac_reward_zero_std": 0.0, "grad_norm": 0.12762655299513068, "kl": 0.0177001953125, "learning_rate": 8.372120666532808e-07, "loss": 0.0698, "num_tokens": 966350871.0, "reward": 1.5468751192092896, "reward_std": 0.3452845513820648, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.14789186418056488, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 967.5357666015625, "completions/mean_terminated_length": 816.32568359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.3522454850567365, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.125028850924469, "kl": 0.0203857421875, "learning_rate": 8.36954274328419e-07, "loss": 0.0189, "num_tokens": 966852951.0, "reward": 1.4949777126312256, "reward_std": 0.33132684230804443, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.17236346006393433, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1154.560302734375, "completions/mean_terminated_length": 874.2140502929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.35245857972404243, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12200378824755592, "kl": 0.01568603515625, "learning_rate": 8.366963231963306e-07, "loss": 0.0864, "num_tokens": 967441506.0, "reward": 1.3470982313156128, "reward_std": 0.2944601774215698, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.19318638741970062, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 893.2031860351562, "completions/mean_terminated_length": 711.180908203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35267167439134833, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1441753126497566, "kl": 0.02227783203125, "learning_rate": 8.364382133997722e-07, "loss": 0.1076, "num_tokens": 967906077.0, "reward": 1.508928656578064, "reward_std": 0.3505012094974518, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20760497450828552, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1043.3929443359375, "completions/mean_terminated_length": 847.829345703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3528847690586543, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12115994017717166, "kl": 0.019378662109375, "learning_rate": 8.361799450815875e-07, "loss": 0.0811, "num_tokens": 968446749.0, "reward": 1.5680804252624512, "reward_std": 0.31966859102249146, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18280036747455597, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1042.484375, "completions/mean_terminated_length": 803.60498046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.35309786372596025, "frac_reward_zero_std": 0.0, "grad_norm": 0.13321623980124786, "kl": 0.01904296875, "learning_rate": 8.359215183847086e-07, "loss": 0.0663, "num_tokens": 968978134.0, "reward": 1.5005581378936768, "reward_std": 0.2845189571380615, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.1692424863576889, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1129.29248046875, "completions/mean_terminated_length": 861.8875732421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3533109583932662, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13073354791989578, "kl": 0.016571044921875, "learning_rate": 8.356629334521545e-07, "loss": 0.0521, "num_tokens": 969561689.0, "reward": 1.3052456378936768, "reward_std": 0.3249323070049286, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.22644878923892975, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 987.4464721679688, "completions/mean_terminated_length": 807.4569091796875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.35352405306057216, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13731001616266328, "kl": 0.0172882080078125, "learning_rate": 8.354041904270324e-07, "loss": 0.1203, "num_tokens": 970076385.0, "reward": 1.5106027126312256, "reward_std": 0.3532072603702545, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.2085641622543335, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1098.640625, "completions/mean_terminated_length": 829.338134765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3537371477278781, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1409123868168773, "kl": 0.0172119140625, "learning_rate": 8.351452894525368e-07, "loss": 0.1014, "num_tokens": 970635440.0, "reward": 1.3867188692092896, "reward_std": 0.3475736975669861, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9246651530265808, "rewards/tag_count_reward/std": 0.22131875157356262, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 971.0938110351562, "completions/mean_terminated_length": 785.0314331054688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3539502423951841, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13482737552091728, "kl": 0.018646240234375, "learning_rate": 8.348862306719495e-07, "loss": 0.1056, "num_tokens": 971141210.0, "reward": 1.5055804252624512, "reward_std": 0.3174375295639038, "rewards/accuracy_reward/mean": 0.5902777910232544, "rewards/accuracy_reward/std": 0.49235257506370544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.17736537754535675, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1097.1451416015625, "completions/mean_terminated_length": 871.2514038085938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.35416333706249004, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12153666244557827, "kl": 0.01702880859375, "learning_rate": 8.346270142286397e-07, "loss": 0.088, "num_tokens": 971703307.0, "reward": 1.3431919813156128, "reward_std": 0.3357907831668854, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.16183683276176453, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1014.1004638671875, "completions/mean_terminated_length": 772.0027465820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35437643172979594, "frac_reward_zero_std": 0.0, "grad_norm": 0.12967449414124, "kl": 0.019866943359375, "learning_rate": 8.343676402660638e-07, "loss": 0.081, "num_tokens": 972224392.0, "reward": 1.536272406578064, "reward_std": 0.3587360978126526, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.19454751908779144, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1022.0625610351562, "completions/mean_terminated_length": 884.4050903320312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3545895263971019, "frac_reward_zero_std": 0.0, "grad_norm": 0.11471894006231183, "kl": 0.0185546875, "learning_rate": 8.341081089277655e-07, "loss": 0.049, "num_tokens": 972753876.0, "reward": 1.4285714626312256, "reward_std": 0.36653241515159607, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.18907469511032104, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1111.622802734375, "completions/mean_terminated_length": 882.7305908203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35480262106440785, "frac_reward_zero_std": 0.0, "grad_norm": 0.13530110386335292, "kl": 0.01776123046875, "learning_rate": 8.338484203573756e-07, "loss": 0.0968, "num_tokens": 973325467.0, "reward": 1.3510044813156128, "reward_std": 0.3596004843711853, "rewards/accuracy_reward/mean": 0.49038460850715637, "rewards/accuracy_reward/std": 0.50050950050354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8956473469734192, "rewards/tag_count_reward/std": 0.25165674090385437, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1180.540283203125, "completions/mean_terminated_length": 887.934326171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3550157157317138, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1116113699141581, "kl": 0.0158538818359375, "learning_rate": 8.335885746986118e-07, "loss": 0.0546, "num_tokens": 973926493.0, "reward": 1.208147406578064, "reward_std": 0.32724109292030334, "rewards/accuracy_reward/mean": 0.2901785671710968, "rewards/accuracy_reward/std": 0.4543520212173462, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.22438858449459076, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1017.5714721679688, "completions/mean_terminated_length": 796.9647827148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35522881039901977, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13657784015413618, "kl": 0.01800537109375, "learning_rate": 8.333285720952787e-07, "loss": 0.0385, "num_tokens": 974448253.0, "reward": 1.4068081378936768, "reward_std": 0.3253634572029114, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.17820364236831665, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1078.9732666015625, "completions/mean_terminated_length": 848.762451171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3554419050663257, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11625420033079578, "kl": 0.01666259765625, "learning_rate": 8.330684126912679e-07, "loss": 0.0719, "num_tokens": 974993665.0, "reward": 1.5848214626312256, "reward_std": 0.3165508806705475, "rewards/accuracy_reward/mean": 0.6629464030265808, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20557457208633423, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1159.466552734375, "completions/mean_terminated_length": 887.4664916992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3556549997336317, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1171302782261919, "kl": 0.0158233642578125, "learning_rate": 8.328080966305577e-07, "loss": 0.0569, "num_tokens": 975587218.0, "reward": 1.4012277126312256, "reward_std": 0.3639538884162903, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.18292580544948578, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1000.7567138671875, "completions/mean_terminated_length": 779.9865112304688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.35586809440093764, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13116914093618792, "kl": 0.018829345703125, "learning_rate": 8.325476240572131e-07, "loss": 0.0852, "num_tokens": 976104261.0, "reward": 1.4754464626312256, "reward_std": 0.3803035318851471, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.22219766676425934, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1208.125, "completions/mean_terminated_length": 904.3404541015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.35608118906824354, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12044837001720093, "kl": 0.0179901123046875, "learning_rate": 8.322869951153859e-07, "loss": 0.0858, "num_tokens": 976719853.0, "reward": 1.3537946939468384, "reward_std": 0.31160521507263184, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22028762102127075, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 948.872802734375, "completions/mean_terminated_length": 772.3289794921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3562942837355495, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3304440682812741, "kl": 0.0545654296875, "learning_rate": 8.32026209949314e-07, "loss": 0.0757, "num_tokens": 977210500.0, "reward": 1.5669643878936768, "reward_std": 0.3016761243343353, "rewards/accuracy_reward/mean": 0.6759259104728699, "rewards/accuracy_reward/std": 0.4685704708099365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.2324453443288803, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1168.4554443359375, "completions/mean_terminated_length": 835.58154296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35650737840285546, "frac_reward_zero_std": 0.0, "grad_norm": 0.14941878690052934, "kl": 0.01751708984375, "learning_rate": 8.317652687033223e-07, "loss": 0.0628, "num_tokens": 977807296.0, "reward": 1.3297991752624512, "reward_std": 0.42319467663764954, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9168526530265808, "rewards/tag_count_reward/std": 0.22850678861141205, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1031.2835693359375, "completions/mean_terminated_length": 823.5671997070312, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3567204730701614, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12458043412497996, "kl": 0.01824951171875, "learning_rate": 8.315041715218216e-07, "loss": 0.084, "num_tokens": 978340655.0, "reward": 1.5491071939468384, "reward_std": 0.3681546747684479, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.19925907254219055, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1069.8795166015625, "completions/mean_terminated_length": 876.3475952148438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.35693356773746737, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12201868729627982, "kl": 0.01983642578125, "learning_rate": 8.312429185493091e-07, "loss": 0.0659, "num_tokens": 978886969.0, "reward": 1.450334906578064, "reward_std": 0.3468681573867798, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.18704918026924133, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 988.46435546875, "completions/mean_terminated_length": 821.4573974609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.35714666240477333, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13102302072404803, "kl": 0.01837158203125, "learning_rate": 8.309815099303687e-07, "loss": 0.0715, "num_tokens": 979397161.0, "reward": 1.5262277126312256, "reward_std": 0.26391923427581787, "rewards/accuracy_reward/mean": 0.6105769276618958, "rewards/accuracy_reward/std": 0.4882066547870636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15467405319213867, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1124.634033203125, "completions/mean_terminated_length": 895.721435546875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3573597570720793, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11683748049961222, "kl": 0.01702880859375, "learning_rate": 8.307199458096699e-07, "loss": 0.0589, "num_tokens": 979972085.0, "reward": 1.4654018878936768, "reward_std": 0.37644821405410767, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.207614004611969, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1164.34375, "completions/mean_terminated_length": 916.9199829101562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.35757285173938524, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12329211528085454, "kl": 0.017242431640625, "learning_rate": 8.304582263319683e-07, "loss": 0.0367, "num_tokens": 980570479.0, "reward": 1.3392857313156128, "reward_std": 0.3130727708339691, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.19925905764102936, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1121.1116943359375, "completions/mean_terminated_length": 884.845947265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35778594640669115, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12438145003668408, "kl": 0.0177001953125, "learning_rate": 8.301963516421055e-07, "loss": 0.0615, "num_tokens": 981139537.0, "reward": 1.5239956378936768, "reward_std": 0.25324687361717224, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.1899302750825882, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1063.4732666015625, "completions/mean_terminated_length": 836.2747192382812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3579990410739971, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13310002736889023, "kl": 0.018310546875, "learning_rate": 8.299343218850094e-07, "loss": 0.0865, "num_tokens": 981680629.0, "reward": 1.5145089626312256, "reward_std": 0.34273067116737366, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.2021544873714447, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1086.01123046875, "completions/mean_terminated_length": 864.0137329101562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.35821213574130306, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11457330950328799, "kl": 0.018951416015625, "learning_rate": 8.296721372056933e-07, "loss": 0.0597, "num_tokens": 982238202.0, "reward": 1.4944196939468384, "reward_std": 0.3844994902610779, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20643207430839539, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1095.4129638671875, "completions/mean_terminated_length": 900.7984008789062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.358425230408609, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11919204974769292, "kl": 0.018768310546875, "learning_rate": 8.294097977492564e-07, "loss": 0.0808, "num_tokens": 982792051.0, "reward": 1.5424107313156128, "reward_std": 0.37556812167167664, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19573186337947845, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1067.4910888671875, "completions/mean_terminated_length": 888.9815673828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.358638325075915, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12468285425006967, "kl": 0.01904296875, "learning_rate": 8.291473036608834e-07, "loss": 0.1023, "num_tokens": 983338127.0, "reward": 1.5446429252624512, "reward_std": 0.33349454402923584, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19357694685459137, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1051.602783203125, "completions/mean_terminated_length": 818.2864990234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35885141974322093, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12461767786802058, "kl": 0.019989013671875, "learning_rate": 8.288846550858446e-07, "loss": 0.0632, "num_tokens": 983873693.0, "reward": 1.583147406578064, "reward_std": 0.3163088858127594, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16086193919181824, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1087.5513916015625, "completions/mean_terminated_length": 872.3688354492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3590645144105269, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12588477808091145, "kl": 0.01849365234375, "learning_rate": 8.286218521694961e-07, "loss": 0.1261, "num_tokens": 984434724.0, "reward": 1.4514509439468384, "reward_std": 0.3659247159957886, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.20658548176288605, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1075.4554443359375, "completions/mean_terminated_length": 803.142822265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.35927760907783285, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13375983502899372, "kl": 0.01934814453125, "learning_rate": 8.283588950572791e-07, "loss": 0.1389, "num_tokens": 984987904.0, "reward": 1.4335938692092896, "reward_std": 0.34458714723587036, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.24595172703266144, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 984.7522583007812, "completions/mean_terminated_length": 767.529541015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35949070374513875, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11898711831861983, "kl": 0.01959228515625, "learning_rate": 8.280957838947204e-07, "loss": 0.0767, "num_tokens": 985495457.0, "reward": 1.5329241752624512, "reward_std": 0.28722572326660156, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.1839466691017151, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1081.9754638671875, "completions/mean_terminated_length": 855.7713623046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3597037984124447, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.126314108261914, "kl": 0.017425537109375, "learning_rate": 8.278325188274316e-07, "loss": 0.0915, "num_tokens": 986048118.0, "reward": 1.4280134439468384, "reward_std": 0.3145117163658142, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18318496644496918, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1067.1629638671875, "completions/mean_terminated_length": 866.77685546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.35991689307975067, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13189044116987939, "kl": 0.020355224609375, "learning_rate": 8.275691000011098e-07, "loss": 0.0928, "num_tokens": 986594319.0, "reward": 1.559709906578064, "reward_std": 0.2899355888366699, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.15902772545814514, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1011.482177734375, "completions/mean_terminated_length": 789.5718383789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3601299877470566, "frac_reward_zero_std": 0.0, "grad_norm": 0.18035025318004214, "kl": 0.022674560546875, "learning_rate": 8.273055275615374e-07, "loss": 0.1362, "num_tokens": 987115751.0, "reward": 1.4871652126312256, "reward_std": 0.3944976031780243, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20367643237113953, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1078.352783203125, "completions/mean_terminated_length": 892.6754760742188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3603430824143626, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1190019355143982, "kl": 0.017578125, "learning_rate": 8.270418016545812e-07, "loss": 0.102, "num_tokens": 987662101.0, "reward": 1.5463169813156128, "reward_std": 0.362278014421463, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2172366976737976, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1040.546875, "completions/mean_terminated_length": 828.1648559570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36055617708166854, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1212077769971808, "kl": 0.017608642578125, "learning_rate": 8.267779224261939e-07, "loss": 0.0307, "num_tokens": 988201546.0, "reward": 1.5541294813156128, "reward_std": 0.2592855393886566, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1395251452922821, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1076.669677734375, "completions/mean_terminated_length": 829.0756225585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3607692717489745, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1239379273978354, "kl": 0.01617431640625, "learning_rate": 8.265138900224117e-07, "loss": 0.0371, "num_tokens": 988753222.0, "reward": 1.4614956378936768, "reward_std": 0.3413955867290497, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1752651333808899, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1032.5357666015625, "completions/mean_terminated_length": 828.3539428710938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.36098236641628045, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13141921902148235, "kl": 0.018280029296875, "learning_rate": 8.262497045893569e-07, "loss": 0.0771, "num_tokens": 989296230.0, "reward": 1.5072544813156128, "reward_std": 0.3216952085494995, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.1573467254638672, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1176.7054443359375, "completions/mean_terminated_length": 879.3173828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.36119546108358636, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11074433099547726, "kl": 0.016693115234375, "learning_rate": 8.259853662732358e-07, "loss": 0.0801, "num_tokens": 989893906.0, "reward": 1.3850446939468384, "reward_std": 0.3302210867404938, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18940123915672302, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1162.078125, "completions/mean_terminated_length": 877.2241821289062, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.3614085557508923, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11806739774367307, "kl": 0.01617431640625, "learning_rate": 8.257208752203392e-07, "loss": 0.0752, "num_tokens": 990479589.0, "reward": 1.4330357313156128, "reward_std": 0.28618568181991577, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14133092761039734, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1082.4263916015625, "completions/mean_terminated_length": 836.2997436523438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.36162165041819827, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1273603523245905, "kl": 0.017120361328125, "learning_rate": 8.254562315770428e-07, "loss": 0.0899, "num_tokens": 991027796.0, "reward": 1.5083706378936768, "reward_std": 0.35290732979774475, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18088065087795258, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1055.5804443359375, "completions/mean_terminated_length": 849.6064453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.36183474508550423, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1321313240164551, "kl": 0.0201416015625, "learning_rate": 8.251914354898067e-07, "loss": 0.1121, "num_tokens": 991567784.0, "reward": 1.4704241752624512, "reward_std": 0.3289593458175659, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.18413659930229187, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1060.546875, "completions/mean_terminated_length": 832.673095703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3620478397528102, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13345264592004155, "kl": 0.0173492431640625, "learning_rate": 8.249264871051751e-07, "loss": 0.0804, "num_tokens": 992116365.0, "reward": 1.4129464626312256, "reward_std": 0.36945098638534546, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16981780529022217, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 977.51123046875, "completions/mean_terminated_length": 827.6972045898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.36226093442011614, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 9.959927533064949, "kl": 0.44854736328125, "learning_rate": 8.246613865697767e-07, "loss": 0.0959, "num_tokens": 992620642.0, "reward": 1.594866156578064, "reward_std": 0.2843775153160095, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17070865631103516, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1151.84375, "completions/mean_terminated_length": 853.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.3624740290874221, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11214373283501447, "kl": 0.0160675048828125, "learning_rate": 8.243961340303245e-07, "loss": 0.0532, "num_tokens": 993214620.0, "reward": 1.3989956378936768, "reward_std": 0.3131371736526489, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9213169813156128, "rewards/tag_count_reward/std": 0.2294771820306778, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1057.7076416015625, "completions/mean_terminated_length": 864.9306640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.36268712375472806, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12338856575292699, "kl": 0.0188140869140625, "learning_rate": 8.241307296336151e-07, "loss": 0.1174, "num_tokens": 993761689.0, "reward": 1.5373884439468384, "reward_std": 0.38775938749313354, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17854657769203186, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1098.8348388671875, "completions/mean_terminated_length": 853.5449829101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.362900218422034, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1312230965107301, "kl": 0.017120361328125, "learning_rate": 8.238651735265298e-07, "loss": 0.092, "num_tokens": 994324191.0, "reward": 1.4235491752624512, "reward_std": 0.3866247534751892, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19661563634872437, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 971.4844360351562, "completions/mean_terminated_length": 792.0651245117188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3631133130893399, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1186374845999787, "kl": 0.02142333984375, "learning_rate": 8.235994658560338e-07, "loss": 0.0522, "num_tokens": 994832360.0, "reward": 1.5145089626312256, "reward_std": 0.2828209102153778, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18046344816684723, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 912.2188110351562, "completions/mean_terminated_length": 749.9642944335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3633264077566459, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11737759474799048, "kl": 0.021728515625, "learning_rate": 8.233336067691755e-07, "loss": 0.0756, "num_tokens": 995308314.0, "reward": 1.6032366752624512, "reward_std": 0.32066306471824646, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1649162620306015, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1143.41748046875, "completions/mean_terminated_length": 912.8375854492188, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.36353950242395183, "frac_reward_zero_std": 0.0, "grad_norm": 0.11893022266779026, "kl": 0.0166778564453125, "learning_rate": 8.230675964130879e-07, "loss": 0.0462, "num_tokens": 995884501.0, "reward": 1.4709821939468384, "reward_std": 0.3447098135948181, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.16584569215774536, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1055.575927734375, "completions/mean_terminated_length": 881.0551147460938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3637525970912578, "frac_reward_zero_std": 0.0, "grad_norm": 0.11985776270875176, "kl": 0.017333984375, "learning_rate": 8.228014349349872e-07, "loss": 0.1165, "num_tokens": 996430519.0, "reward": 1.5770089626312256, "reward_std": 0.36568358540534973, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1748131364583969, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1238.6295166015625, "completions/mean_terminated_length": 942.5182495117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36396569175856375, "frac_reward_zero_std": 0.0, "grad_norm": 0.11076538293855635, "kl": 0.0143890380859375, "learning_rate": 8.225351224821733e-07, "loss": 0.0518, "num_tokens": 997061777.0, "reward": 1.4123884439468384, "reward_std": 0.3476875424385071, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.20359058678150177, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1001.6339721679688, "completions/mean_terminated_length": 763.6931762695312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3641787864258697, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1212053662061018, "kl": 0.019683837890625, "learning_rate": 8.222686592020298e-07, "loss": 0.0839, "num_tokens": 997577533.0, "reward": 1.4827009439468384, "reward_std": 0.3263048231601715, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1681026816368103, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1066.13623046875, "completions/mean_terminated_length": 832.875732421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.36439188109317566, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11763812531412161, "kl": 0.01885986328125, "learning_rate": 8.220020452420241e-07, "loss": 0.0693, "num_tokens": 998121146.0, "reward": 1.4603794813156128, "reward_std": 0.31841525435447693, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.17382751405239105, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1117.091552734375, "completions/mean_terminated_length": 879.8011474609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3646049757604816, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12729011810206955, "kl": 0.0177001953125, "learning_rate": 8.217352807497062e-07, "loss": 0.0991, "num_tokens": 998689651.0, "reward": 1.5524554252624512, "reward_std": 0.3326716125011444, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18032504618167877, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1031.5535888671875, "completions/mean_terminated_length": 783.0889282226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3648180704277875, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2462535373934271, "kl": 0.0220947265625, "learning_rate": 8.2146836587271e-07, "loss": 0.0574, "num_tokens": 999220683.0, "reward": 1.5189732313156128, "reward_std": 0.28955236077308655, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1880783587694168, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1064.977783203125, "completions/mean_terminated_length": 817.84912109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3650311650950935, "frac_reward_zero_std": 0.0, "grad_norm": 0.11847887395497089, "kl": 0.0179290771484375, "learning_rate": 8.212013007587524e-07, "loss": 0.0395, "num_tokens": 999766369.0, "reward": 1.5189732313156128, "reward_std": 0.37035804986953735, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16426870226860046, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1127.96435546875, "completions/mean_terminated_length": 918.750732421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.36524425976239944, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11784266280503311, "kl": 0.01812744140625, "learning_rate": 8.209340855556336e-07, "loss": 0.0813, "num_tokens": 1000335969.0, "reward": 1.3900669813156128, "reward_std": 0.33505529165267944, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20008359849452972, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1093.328125, "completions/mean_terminated_length": 839.8276977539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3654573544297054, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11780569090551464, "kl": 0.017791748046875, "learning_rate": 8.206667204112366e-07, "loss": 0.0652, "num_tokens": 1000895284.0, "reward": 1.4118304252624512, "reward_std": 0.3445415794849396, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18040810525417328, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1115.279052734375, "completions/mean_terminated_length": 900.0357666015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36567044909701135, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11439541125913491, "kl": 0.01837158203125, "learning_rate": 8.203992054735276e-07, "loss": 0.0753, "num_tokens": 1001465809.0, "reward": 1.4475446939468384, "reward_std": 0.36878278851509094, "rewards/accuracy_reward/mean": 0.5532407164573669, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2332361787557602, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1121.8304443359375, "completions/mean_terminated_length": 824.035400390625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3658835437643173, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1198601269318102, "kl": 0.0179290771484375, "learning_rate": 8.201315408905557e-07, "loss": 0.1124, "num_tokens": 1002040053.0, "reward": 1.4430804252624512, "reward_std": 0.32946091890335083, "rewards/accuracy_reward/mean": 0.5578703880310059, "rewards/accuracy_reward/std": 0.49721553921699524, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9051339030265808, "rewards/tag_count_reward/std": 0.23752164840698242, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1089.118408203125, "completions/mean_terminated_length": 831.0623168945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36609663843162327, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13048665007835744, "kl": 0.019439697265625, "learning_rate": 8.198637268104528e-07, "loss": 0.0799, "num_tokens": 1002594218.0, "reward": 1.5345982313156128, "reward_std": 0.36409419775009155, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22654591500759125, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1164.3616943359375, "completions/mean_terminated_length": 920.1652221679688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3663097330989292, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.125473116194484, "kl": 0.0157012939453125, "learning_rate": 8.195957633814333e-07, "loss": 0.0494, "num_tokens": 1003192492.0, "reward": 1.3152902126312256, "reward_std": 0.351897656917572, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2159051150083542, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1134.8170166015625, "completions/mean_terminated_length": 933.2697143554688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3665228277662351, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11461254218935943, "kl": 0.020355224609375, "learning_rate": 8.193276507517946e-07, "loss": 0.0582, "num_tokens": 1003765658.0, "reward": 1.4849331378936768, "reward_std": 0.33629924058914185, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.18820028007030487, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1082.915283203125, "completions/mean_terminated_length": 840.2960815429688, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3667359224335411, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12126027271747143, "kl": 0.018096923828125, "learning_rate": 8.190593890699165e-07, "loss": 0.0625, "num_tokens": 1004315316.0, "reward": 1.4720982313156128, "reward_std": 0.32021471858024597, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.1661728024482727, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1124.5692138671875, "completions/mean_terminated_length": 920.7601928710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36694901710084704, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11582499865810182, "kl": 0.01727294921875, "learning_rate": 8.187909784842612e-07, "loss": 0.0508, "num_tokens": 1004891891.0, "reward": 1.3515626192092896, "reward_std": 0.28647276759147644, "rewards/accuracy_reward/mean": 0.41203704476356506, "rewards/accuracy_reward/std": 0.4927724003791809, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1625574380159378, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1146.5692138671875, "completions/mean_terminated_length": 846.09228515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.367162111768153, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12354799408436021, "kl": 0.0159149169921875, "learning_rate": 8.185224191433738e-07, "loss": 0.0794, "num_tokens": 1005473714.0, "reward": 1.372209906578064, "reward_std": 0.33468201756477356, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17627368867397308, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1107.3504638671875, "completions/mean_terminated_length": 843.9685668945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36737520643545896, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12316917774905219, "kl": 0.018524169921875, "learning_rate": 8.182537111958807e-07, "loss": 0.0876, "num_tokens": 1006036127.0, "reward": 1.4196429252624512, "reward_std": 0.36146289110183716, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23967991769313812, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1018.7857666015625, "completions/mean_terminated_length": 798.4390258789062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3675883011027649, "frac_reward_zero_std": 0.0, "grad_norm": 0.13143077450399732, "kl": 0.0181884765625, "learning_rate": 8.179848547904916e-07, "loss": 0.1183, "num_tokens": 1006558303.0, "reward": 1.3850446939468384, "reward_std": 0.3376026749610901, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16994640231132507, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 987.27685546875, "completions/mean_terminated_length": 810.4896240234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.36780139577007087, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1332942382157942, "kl": 0.020782470703125, "learning_rate": 8.177158500759979e-07, "loss": 0.0697, "num_tokens": 1007066363.0, "reward": 1.5535714626312256, "reward_std": 0.33747413754463196, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1639530062675476, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1073.477783203125, "completions/mean_terminated_length": 841.9613647460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.36801449043737683, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1217144579086224, "kl": 0.0186767578125, "learning_rate": 8.174466972012731e-07, "loss": 0.0807, "num_tokens": 1007620433.0, "reward": 1.4988839626312256, "reward_std": 0.30491694808006287, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16578169167041779, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1027.8348388671875, "completions/mean_terminated_length": 809.4254760742188, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.36822758510468273, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12255101277995828, "kl": 0.020050048828125, "learning_rate": 8.171773963152728e-07, "loss": 0.1073, "num_tokens": 1008144727.0, "reward": 1.5223214626312256, "reward_std": 0.3487630784511566, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.2026386857032776, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1037.109375, "completions/mean_terminated_length": 859.3411865234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.3684406797719887, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13037602065967266, "kl": 0.01898193359375, "learning_rate": 8.169079475670342e-07, "loss": 0.0905, "num_tokens": 1008679160.0, "reward": 1.469866156578064, "reward_std": 0.37427228689193726, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.21069389581680298, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 918.7879638671875, "completions/mean_terminated_length": 801.972900390625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.36865377443929465, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13342643249824107, "kl": 0.020782470703125, "learning_rate": 8.166383511056767e-07, "loss": 0.1379, "num_tokens": 1009157369.0, "reward": 1.661272406578064, "reward_std": 0.34895122051239014, "rewards/accuracy_reward/mean": 0.7232142686843872, "rewards/accuracy_reward/std": 0.44790980219841003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19121423363685608, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1138.040283203125, "completions/mean_terminated_length": 879.9140625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3688668691066006, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11566171433185084, "kl": 0.01776123046875, "learning_rate": 8.163686070804013e-07, "loss": 0.0872, "num_tokens": 1009742107.0, "reward": 1.5212054252624512, "reward_std": 0.31370410323143005, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.2049998939037323, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 982.1004638671875, "completions/mean_terminated_length": 771.2005615234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.36907996377390656, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13021953861724425, "kl": 0.019744873046875, "learning_rate": 8.160987156404907e-07, "loss": 0.0556, "num_tokens": 1010255512.0, "reward": 1.5351563692092896, "reward_std": 0.28683602809906006, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17025740444660187, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1093.0648193359375, "completions/mean_terminated_length": 839.4943237304688, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.3692930584412125, "frac_reward_zero_std": 0.0, "grad_norm": 0.12509578018928036, "kl": 0.01806640625, "learning_rate": 8.158286769353091e-07, "loss": 0.0747, "num_tokens": 1010808821.0, "reward": 1.411272406578064, "reward_std": 0.3108561635017395, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.1990012526512146, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1122.375, "completions/mean_terminated_length": 852.9567260742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3695061531085185, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.54854668617899, "kl": 0.026611328125, "learning_rate": 8.155584911143022e-07, "loss": 0.0888, "num_tokens": 1011383677.0, "reward": 1.3950893878936768, "reward_std": 0.348760724067688, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.21133582293987274, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 980.3058471679688, "completions/mean_terminated_length": 772.4613037109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.36971924777582443, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14154967962596304, "kl": 0.021697998046875, "learning_rate": 8.152881583269973e-07, "loss": 0.0817, "num_tokens": 1011891190.0, "reward": 1.5446429252624512, "reward_std": 0.34302085638046265, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18769630789756775, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 979.9308471679688, "completions/mean_terminated_length": 811.578857421875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.36993234244313034, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11742019174956134, "kl": 0.01983642578125, "learning_rate": 8.15017678723003e-07, "loss": 0.0509, "num_tokens": 1012394775.0, "reward": 1.5122768878936768, "reward_std": 0.27459022402763367, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13944123685359955, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1066.28125, "completions/mean_terminated_length": 794.9800415039062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3701454371104363, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12609670433142364, "kl": 0.01702880859375, "learning_rate": 8.147470524520086e-07, "loss": 0.0533, "num_tokens": 1012935653.0, "reward": 1.5574777126312256, "reward_std": 0.2559746205806732, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16314294934272766, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1154.747802734375, "completions/mean_terminated_length": 891.4190673828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37035853177774225, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1144801754671953, "kl": 0.014862060546875, "learning_rate": 8.144762796637854e-07, "loss": 0.0837, "num_tokens": 1013520980.0, "reward": 1.450334906578064, "reward_std": 0.3751393258571625, "rewards/accuracy_reward/mean": 0.5439814925193787, "rewards/accuracy_reward/std": 0.4986393451690674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2093706578016281, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1042.341552734375, "completions/mean_terminated_length": 813.6575317382812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3705716264450482, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1241329890380246, "kl": 0.018646240234375, "learning_rate": 8.142053605081854e-07, "loss": 0.1473, "num_tokens": 1014060285.0, "reward": 1.5234376192092896, "reward_std": 0.38116154074668884, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.21293358504772186, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1047.7254638671875, "completions/mean_terminated_length": 771.2962646484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37078472111235417, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12626987222787797, "kl": 0.017822265625, "learning_rate": 8.139342951351415e-07, "loss": 0.0839, "num_tokens": 1014597410.0, "reward": 1.4787946939468384, "reward_std": 0.31804928183555603, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20568081736564636, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1035.421875, "completions/mean_terminated_length": 881.8432006835938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3709978157796601, "frac_reward_zero_std": 0.0, "grad_norm": 0.12744372520311079, "kl": 0.020355224609375, "learning_rate": 8.136630836946678e-07, "loss": 0.0897, "num_tokens": 1015127711.0, "reward": 1.4838169813156128, "reward_std": 0.3630126714706421, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.18937736749649048, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1036.828125, "completions/mean_terminated_length": 810.2813720703125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3712109104469661, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13076871779363503, "kl": 0.018035888671875, "learning_rate": 8.133917263368589e-07, "loss": 0.1002, "num_tokens": 1015654162.0, "reward": 1.5641741752624512, "reward_std": 0.325775146484375, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.1878882646560669, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 937.57373046875, "completions/mean_terminated_length": 785.3832397460938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.37142400511427204, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1272379782179284, "kl": 0.021240234375, "learning_rate": 8.131202232118904e-07, "loss": 0.0782, "num_tokens": 1016138051.0, "reward": 1.555803656578064, "reward_std": 0.3745437562465668, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19418221712112427, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1017.1808471679688, "completions/mean_terminated_length": 799.8729858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37163709978157794, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1263719184175237, "kl": 0.01885986328125, "learning_rate": 8.128485744700185e-07, "loss": 0.0727, "num_tokens": 1016659316.0, "reward": 1.3498884439468384, "reward_std": 0.2841840088367462, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18106688559055328, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1123.607177734375, "completions/mean_terminated_length": 864.7771606445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3718501944488839, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12534276295743713, "kl": 0.0172271728515625, "learning_rate": 8.125767802615799e-07, "loss": 0.0854, "num_tokens": 1017236628.0, "reward": 1.4363839626312256, "reward_std": 0.33974677324295044, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.16381210088729858, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1098.946533203125, "completions/mean_terminated_length": 801.1495361328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37206328911618985, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1336970587515128, "kl": 0.018218994140625, "learning_rate": 8.123048407369921e-07, "loss": 0.0671, "num_tokens": 1017796236.0, "reward": 1.3928571939468384, "reward_std": 0.3050023019313812, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1854480504989624, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1012.044677734375, "completions/mean_terminated_length": 816.9442749023438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3722763837834958, "frac_reward_zero_std": 0.0, "grad_norm": 0.13348417502947832, "kl": 0.020294189453125, "learning_rate": 8.120327560467526e-07, "loss": 0.0751, "num_tokens": 1018326224.0, "reward": 1.512834906578064, "reward_std": 0.3099156320095062, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1567346155643463, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1015.9531860351562, "completions/mean_terminated_length": 801.7546997070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.37248947845080177, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11863971452607153, "kl": 0.018096923828125, "learning_rate": 8.117605263414395e-07, "loss": 0.0569, "num_tokens": 1018847691.0, "reward": 1.4910714626312256, "reward_std": 0.30305105447769165, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1605055183172226, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1074.3973388671875, "completions/mean_terminated_length": 843.0994873046875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3727025731181077, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12955659592522656, "kl": 0.018402099609375, "learning_rate": 8.114881517717112e-07, "loss": 0.1153, "num_tokens": 1019398797.0, "reward": 1.4687501192092896, "reward_std": 0.32842057943344116, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.17017030715942383, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1083.2679443359375, "completions/mean_terminated_length": 809.6046142578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3729156677854137, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11362491590559314, "kl": 0.018585205078125, "learning_rate": 8.112156324883059e-07, "loss": 0.0587, "num_tokens": 1019950517.0, "reward": 1.430803656578064, "reward_std": 0.31503814458847046, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.15939722955226898, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 934.60498046875, "completions/mean_terminated_length": 735.3658447265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37312876245271964, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11995131213257423, "kl": 0.023345947265625, "learning_rate": 8.109429686420426e-07, "loss": 0.0503, "num_tokens": 1020435444.0, "reward": 1.5312501192092896, "reward_std": 0.2615625262260437, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1744592934846878, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1097.6942138671875, "completions/mean_terminated_length": 871.9309692382812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37334185712002554, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11842552045973728, "kl": 0.017669677734375, "learning_rate": 8.106701603838194e-07, "loss": 0.0911, "num_tokens": 1020996347.0, "reward": 1.505022406578064, "reward_std": 0.3941715359687805, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.1933954805135727, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1043.40185546875, "completions/mean_terminated_length": 831.6216430664062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3735549517873315, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12450296153295778, "kl": 0.0186767578125, "learning_rate": 8.103972078646154e-07, "loss": 0.0964, "num_tokens": 1021531039.0, "reward": 1.5094866752624512, "reward_std": 0.332633912563324, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17906324565410614, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1093.60498046875, "completions/mean_terminated_length": 815.8126831054688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.37376804645463746, "frac_reward_zero_std": 0.0, "grad_norm": 0.12825384476421609, "kl": 0.016571044921875, "learning_rate": 8.101241112354883e-07, "loss": 0.1036, "num_tokens": 1022092990.0, "reward": 1.5217634439468384, "reward_std": 0.296195387840271, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19803908467292786, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1103.544677734375, "completions/mean_terminated_length": 872.6777954101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3739811411219434, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12777015148978366, "kl": 0.01751708984375, "learning_rate": 8.098508706475765e-07, "loss": 0.1008, "num_tokens": 1022657666.0, "reward": 1.407366156578064, "reward_std": 0.29539307951927185, "rewards/accuracy_reward/mean": 0.4722222089767456, "rewards/accuracy_reward/std": 0.49980661273002625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.15574882924556732, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1117.899658203125, "completions/mean_terminated_length": 785.3181762695312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3741942357892494, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12685884231120742, "kl": 0.018280029296875, "learning_rate": 8.095774862520977e-07, "loss": 0.0921, "num_tokens": 1023236517.0, "reward": 1.5078126192092896, "reward_std": 0.3112878203392029, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.19175945222377777, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1050.962158203125, "completions/mean_terminated_length": 853.6871948242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37440733045655533, "frac_reward_zero_std": 0.0, "grad_norm": 0.13002677944413424, "kl": 0.018218994140625, "learning_rate": 8.093039582003491e-07, "loss": 0.1004, "num_tokens": 1023789252.0, "reward": 1.5837054252624512, "reward_std": 0.4018021523952484, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.16650304198265076, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1064.953125, "completions/mean_terminated_length": 803.9180908203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3746204251238613, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12625153920667695, "kl": 0.017120361328125, "learning_rate": 8.090302866437076e-07, "loss": 0.0547, "num_tokens": 1024341375.0, "reward": 1.493303656578064, "reward_std": 0.28441575169563293, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1484815776348114, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 981.8125610351562, "completions/mean_terminated_length": 810.5595703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.37483351979116725, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12246241365577787, "kl": 0.017913818359375, "learning_rate": 8.087564717336298e-07, "loss": 0.0649, "num_tokens": 1024847915.0, "reward": 1.5396206378936768, "reward_std": 0.3043603301048279, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.1386905312538147, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1034.609375, "completions/mean_terminated_length": 820.9757080078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3750466144584732, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12421330127733124, "kl": 0.018280029296875, "learning_rate": 8.084825136216509e-07, "loss": 0.0452, "num_tokens": 1025378604.0, "reward": 1.5111607313156128, "reward_std": 0.3387458026409149, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1799820363521576, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1133.828125, "completions/mean_terminated_length": 874.5072021484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3752597091257791, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12217365953217148, "kl": 0.0155487060546875, "learning_rate": 8.082084124593858e-07, "loss": 0.0812, "num_tokens": 1025974015.0, "reward": 1.3482143878936768, "reward_std": 0.3762272596359253, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17145662009716034, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1093.13623046875, "completions/mean_terminated_length": 800.8309326171875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.37547280379308506, "frac_reward_zero_std": 0.0, "grad_norm": 0.13513073747325874, "kl": 0.017974853515625, "learning_rate": 8.079341683985286e-07, "loss": 0.0554, "num_tokens": 1026535356.0, "reward": 1.2767857313156128, "reward_std": 0.35951194167137146, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20875635743141174, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1003.1808471679688, "completions/mean_terminated_length": 762.0687255859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.375685898460391, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1292557515257498, "kl": 0.01934814453125, "learning_rate": 8.076597815908526e-07, "loss": 0.0965, "num_tokens": 1027056573.0, "reward": 1.4804688692092896, "reward_std": 0.2993229627609253, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.15527822077274323, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1207.3817138671875, "completions/mean_terminated_length": 896.3272705078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.375898993127697, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11442972564283374, "kl": 0.0172576904296875, "learning_rate": 8.073852521882093e-07, "loss": 0.0516, "num_tokens": 1027675464.0, "reward": 1.3521206378936768, "reward_std": 0.2922036349773407, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1747300922870636, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1137.9107666015625, "completions/mean_terminated_length": 918.5816650390625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.37611208779500294, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1196506210088312, "kl": 0.0179443359375, "learning_rate": 8.071105803425302e-07, "loss": 0.0777, "num_tokens": 1028254464.0, "reward": 1.4157366752624512, "reward_std": 0.32972899079322815, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.18215985596179962, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1006.3928833007812, "completions/mean_terminated_length": 776.5013427734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3763251824623089, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12549337895206544, "kl": 0.01904296875, "learning_rate": 8.068357662058251e-07, "loss": 0.0687, "num_tokens": 1028769824.0, "reward": 1.4559152126312256, "reward_std": 0.28184282779693604, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.19548854231834412, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1048.625, "completions/mean_terminated_length": 837.9459838867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37653827712961485, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11980850805485095, "kl": 0.01800537109375, "learning_rate": 8.065608099301824e-07, "loss": 0.0789, "num_tokens": 1029313912.0, "reward": 1.5066964626312256, "reward_std": 0.36328262090682983, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18833374977111816, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1048.435302734375, "completions/mean_terminated_length": 869.5658569335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3767513717969208, "frac_reward_zero_std": 0.0, "grad_norm": 0.11918970749984055, "kl": 0.018707275390625, "learning_rate": 8.062857116677696e-07, "loss": 0.0511, "num_tokens": 1029850747.0, "reward": 1.6194196939468384, "reward_std": 0.39367207884788513, "rewards/accuracy_reward/mean": 0.6763392686843872, "rewards/accuracy_reward/std": 0.46839532256126404, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.16921022534370422, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 988.2098388671875, "completions/mean_terminated_length": 849.0454711914062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3769644664642267, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13006634861517039, "kl": 0.020416259765625, "learning_rate": 8.060104715708322e-07, "loss": 0.1084, "num_tokens": 1030349465.0, "reward": 1.5786831378936768, "reward_std": 0.383155882358551, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.1969708651304245, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1111.921875, "completions/mean_terminated_length": 839.4610595703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37717756113153267, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11957988756810889, "kl": 0.0169830322265625, "learning_rate": 8.057350897916948e-07, "loss": 0.0369, "num_tokens": 1030923846.0, "reward": 1.3532366752624512, "reward_std": 0.31791016459465027, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19697721302509308, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1052.7210693359375, "completions/mean_terminated_length": 823.041259765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3773906557988386, "frac_reward_zero_std": 0.0, "grad_norm": 0.13231535328403388, "kl": 0.019866943359375, "learning_rate": 8.0545956648276e-07, "loss": 0.135, "num_tokens": 1031464841.0, "reward": 1.4994419813156128, "reward_std": 0.41839471459388733, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19874386489391327, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1066.1763916015625, "completions/mean_terminated_length": 798.40625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3776037504661446, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1196844932436743, "kl": 0.017303466796875, "learning_rate": 8.051839017965091e-07, "loss": 0.1029, "num_tokens": 1032013064.0, "reward": 1.4910714626312256, "reward_std": 0.3079325556755066, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1675233542919159, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 951.716552734375, "completions/mean_terminated_length": 772.3246459960938, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.37781684513345054, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14280700810925132, "kl": 0.02117919921875, "learning_rate": 8.049080958855012e-07, "loss": 0.1123, "num_tokens": 1032508329.0, "reward": 1.6350446939468384, "reward_std": 0.3310209810733795, "rewards/accuracy_reward/mean": 0.6897321343421936, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17798370122909546, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1104.872802734375, "completions/mean_terminated_length": 864.4678344726562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3780299398007565, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11952240844526296, "kl": 0.0157928466796875, "learning_rate": 8.046321489023736e-07, "loss": 0.112, "num_tokens": 1033074560.0, "reward": 1.4235491752624512, "reward_std": 0.3836788535118103, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18981850147247314, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1031.58935546875, "completions/mean_terminated_length": 807.2588500976562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.37824303446806246, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13835621319224825, "kl": 0.01910400390625, "learning_rate": 8.04356060999842e-07, "loss": 0.0514, "num_tokens": 1033608920.0, "reward": 1.3861607313156128, "reward_std": 0.32100725173950195, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.19211392104625702, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1014.372802734375, "completions/mean_terminated_length": 819.7108764648438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3784561291353684, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13152905571988127, "kl": 0.020721435546875, "learning_rate": 8.040798323307e-07, "loss": 0.0717, "num_tokens": 1034128031.0, "reward": 1.5072544813156128, "reward_std": 0.26950329542160034, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16235284507274628, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1041.685302734375, "completions/mean_terminated_length": 816.2267456054688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3786692238026743, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11790811939762134, "kl": 0.017822265625, "learning_rate": 8.038034630478191e-07, "loss": 0.0842, "num_tokens": 1034671650.0, "reward": 1.4469866752624512, "reward_std": 0.36048850417137146, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.2086060494184494, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1081.8460693359375, "completions/mean_terminated_length": 835.5714721679688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3788823184699803, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12387912469036946, "kl": 0.017364501953125, "learning_rate": 8.035269533041483e-07, "loss": 0.0842, "num_tokens": 1035225437.0, "reward": 1.4564732313156128, "reward_std": 0.31012076139450073, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17213605344295502, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1108.6763916015625, "completions/mean_terminated_length": 806.6519165039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.37909541313728623, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13060316284638238, "kl": 0.018096923828125, "learning_rate": 8.032503032527148e-07, "loss": 0.088, "num_tokens": 1035789020.0, "reward": 1.407366156578064, "reward_std": 0.32878005504608154, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18340036273002625, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1092.6741943359375, "completions/mean_terminated_length": 868.9752197265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3793085078045922, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11450227013372614, "kl": 0.01593017578125, "learning_rate": 8.029735130466227e-07, "loss": 0.0938, "num_tokens": 1036344090.0, "reward": 1.383928656578064, "reward_std": 0.3981441259384155, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23556096851825714, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1003.5803833007812, "completions/mean_terminated_length": 803.5850830078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.37952160247189815, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1315771784202522, "kl": 0.0176239013671875, "learning_rate": 8.026965828390549e-07, "loss": 0.0913, "num_tokens": 1036856990.0, "reward": 1.5345982313156128, "reward_std": 0.34747010469436646, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17929750680923462, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1107.712158203125, "completions/mean_terminated_length": 834.0259399414062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3797346971392041, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11092434396023591, "kl": 0.0178680419921875, "learning_rate": 8.024195127832708e-07, "loss": 0.043, "num_tokens": 1037428749.0, "reward": 1.3727679252624512, "reward_std": 0.29710641503334045, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19927160441875458, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1082.07373046875, "completions/mean_terminated_length": 804.5086059570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.37994779180651006, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12035328090706476, "kl": 0.018280029296875, "learning_rate": 8.021423030326075e-07, "loss": 0.0778, "num_tokens": 1037981166.0, "reward": 1.3180804252624512, "reward_std": 0.3196057379245758, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.20377831161022186, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 997.05810546875, "completions/mean_terminated_length": 825.085693359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.380160886473816, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12269569712608709, "kl": 0.019775390625, "learning_rate": 8.018649537404791e-07, "loss": 0.0886, "num_tokens": 1038498120.0, "reward": 1.4754464626312256, "reward_std": 0.3356422185897827, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19280149042606354, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1070.7076416015625, "completions/mean_terminated_length": 811.2005615234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3803739811411219, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13722437240310628, "kl": 0.020111083984375, "learning_rate": 8.015874650603776e-07, "loss": 0.0765, "num_tokens": 1039048069.0, "reward": 1.477678656578064, "reward_std": 0.3419429659843445, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18898223340511322, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1088.716552734375, "completions/mean_terminated_length": 889.6199340820312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3805870758084279, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1276116399540212, "kl": 0.0164794921875, "learning_rate": 8.013098371458715e-07, "loss": 0.1002, "num_tokens": 1039612694.0, "reward": 1.3811384439468384, "reward_std": 0.333150714635849, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1885978728532791, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1125.1629638671875, "completions/mean_terminated_length": 893.164794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38080017047573383, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12871077717206791, "kl": 0.0165252685546875, "learning_rate": 8.010320701506067e-07, "loss": 0.1023, "num_tokens": 1040191887.0, "reward": 1.4654018878936768, "reward_std": 0.3638942837715149, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.1945772022008896, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 1018.04248046875, "completions/mean_terminated_length": 776.8677978515625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3810132651430398, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11907040379227167, "kl": 0.019622802734375, "learning_rate": 8.007541642283058e-07, "loss": 0.0506, "num_tokens": 1040719698.0, "reward": 1.4319196939468384, "reward_std": 0.3242812156677246, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16082797944545746, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1119.712158203125, "completions/mean_terminated_length": 817.6065063476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38122635981034575, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11750005033460884, "kl": 0.016510009765625, "learning_rate": 8.004761195327689e-07, "loss": 0.0897, "num_tokens": 1041285601.0, "reward": 1.4079241752624512, "reward_std": 0.3554452061653137, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1549724042415619, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 911.7076416015625, "completions/mean_terminated_length": 759.2430419921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.3814394544776517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13687215166950273, "kl": 0.02166748046875, "learning_rate": 8.001979362178718e-07, "loss": 0.0819, "num_tokens": 1041763710.0, "reward": 1.6082589626312256, "reward_std": 0.32452070713043213, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.15995624661445618, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1004.9152221679688, "completions/mean_terminated_length": 815.0132446289062, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.38165254914495766, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12917234766735372, "kl": 0.02081298828125, "learning_rate": 7.999196144375682e-07, "loss": 0.0693, "num_tokens": 1042286840.0, "reward": 1.5161831378936768, "reward_std": 0.37198200821876526, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18449558317661285, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1094.328125, "completions/mean_terminated_length": 851.2352905273438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3818656438122636, "frac_reward_zero_std": 0.0, "grad_norm": 0.13049530246507265, "kl": 0.01922607421875, "learning_rate": 7.996411543458876e-07, "loss": 0.0918, "num_tokens": 1042844075.0, "reward": 1.4056919813156128, "reward_std": 0.37510567903518677, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.21352127194404602, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1134.0223388671875, "completions/mean_terminated_length": 904.2513427734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3820787384795695, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13103210607016752, "kl": 0.018707275390625, "learning_rate": 7.993625560969366e-07, "loss": 0.0994, "num_tokens": 1043423461.0, "reward": 1.4017857313156128, "reward_std": 0.34620559215545654, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.19571910798549652, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1093.5067138671875, "completions/mean_terminated_length": 801.3148803710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3822918331468755, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12546049730504288, "kl": 0.018096923828125, "learning_rate": 7.990838198448979e-07, "loss": 0.0762, "num_tokens": 1043983560.0, "reward": 1.4068081378936768, "reward_std": 0.2768586277961731, "rewards/accuracy_reward/mean": 0.4791666567325592, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.18820028007030487, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 918.8482666015625, "completions/mean_terminated_length": 747.5886840820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38250492781418144, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1373295121488945, "kl": 0.022247314453125, "learning_rate": 7.988049457440306e-07, "loss": 0.0831, "num_tokens": 1044458660.0, "reward": 1.5507813692092896, "reward_std": 0.3210321366786957, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.20957329869270325, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 939.997802734375, "completions/mean_terminated_length": 762.0285034179688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3827180224814874, "frac_reward_zero_std": 0.0, "grad_norm": 0.1325885978852418, "kl": 0.02081298828125, "learning_rate": 7.985259339486701e-07, "loss": 0.0997, "num_tokens": 1044944867.0, "reward": 1.5842634439468384, "reward_std": 0.32810935378074646, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.1818443238735199, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 966.9933471679688, "completions/mean_terminated_length": 809.4041137695312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.38293111714879335, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12866016966276977, "kl": 0.017669677734375, "learning_rate": 7.982467846132285e-07, "loss": 0.0525, "num_tokens": 1045449760.0, "reward": 1.614397406578064, "reward_std": 0.3444443941116333, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.151633620262146, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 937.80810546875, "completions/mean_terminated_length": 785.6497192382812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3831442118160993, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12781260420368323, "kl": 0.019561767578125, "learning_rate": 7.97967497892193e-07, "loss": 0.0503, "num_tokens": 1045936122.0, "reward": 1.5563616752624512, "reward_std": 0.31537291407585144, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.16368533670902252, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 902.8995971679688, "completions/mean_terminated_length": 768.685791015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38335730648340527, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13514006746352134, "kl": 0.02056884765625, "learning_rate": 7.976880739401279e-07, "loss": 0.0772, "num_tokens": 1046409261.0, "reward": 1.6138393878936768, "reward_std": 0.28237566351890564, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378541469574, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.12561768293380737, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1051.321533203125, "completions/mean_terminated_length": 841.2108154296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3835704011507112, "frac_reward_zero_std": 0.0, "grad_norm": 0.144307215464622, "kl": 0.0185546875, "learning_rate": 7.974085129116726e-07, "loss": 0.09, "num_tokens": 1046952141.0, "reward": 1.5279018878936768, "reward_std": 0.32799944281578064, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.17489881813526154, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1081.53125, "completions/mean_terminated_length": 835.176513671875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.38378349581801713, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11958077567803473, "kl": 0.0184326171875, "learning_rate": 7.971288149615431e-07, "loss": 0.068, "num_tokens": 1047505259.0, "reward": 1.5212054252624512, "reward_std": 0.3050203025341034, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.15772415697574615, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1085.8348388671875, "completions/mean_terminated_length": 857.254150390625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3839965904853231, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1258723384720399, "kl": 0.019073486328125, "learning_rate": 7.968489802445305e-07, "loss": 0.0668, "num_tokens": 1048057009.0, "reward": 1.5117188692092896, "reward_std": 0.3104957342147827, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17591212689876556, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 893.15185546875, "completions/mean_terminated_length": 748.0703735351562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38420968515262904, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13237794076585396, "kl": 0.022216796875, "learning_rate": 7.965690089155022e-07, "loss": 0.1252, "num_tokens": 1048515141.0, "reward": 1.645647406578064, "reward_std": 0.24935492873191833, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137671947479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16321179270744324, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1075.966552734375, "completions/mean_terminated_length": 861.4304809570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.384422779819935, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1350488368669142, "kl": 0.018310546875, "learning_rate": 7.962889011294004e-07, "loss": 0.1106, "num_tokens": 1049074070.0, "reward": 1.4827009439468384, "reward_std": 0.3695772886276245, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.186005100607872, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1073.102783203125, "completions/mean_terminated_length": 848.1264038085938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.38463587448724096, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11183417831967345, "kl": 0.016937255859375, "learning_rate": 7.960086570412439e-07, "loss": 0.0665, "num_tokens": 1049626004.0, "reward": 1.4626116752624512, "reward_std": 0.33566537499427795, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.18208445608615875, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1207.359375, "completions/mean_terminated_length": 930.4718017578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3848489691545469, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10371128354773383, "kl": 0.0159149169921875, "learning_rate": 7.957282768061258e-07, "loss": 0.0673, "num_tokens": 1050236629.0, "reward": 1.4469866752624512, "reward_std": 0.32617658376693726, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19121423363685608, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 921.5982666015625, "completions/mean_terminated_length": 754.0820922851562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3850620638218529, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14412281616411, "kl": 0.018768310546875, "learning_rate": 7.954477605792157e-07, "loss": 0.0978, "num_tokens": 1050709985.0, "reward": 1.5407366752624512, "reward_std": 0.3019663989543915, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.16706722974777222, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1039.74560546875, "completions/mean_terminated_length": 807.0714721679688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.38527515848915883, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11914238745813858, "kl": 0.016998291015625, "learning_rate": 7.951671085157574e-07, "loss": 0.1028, "num_tokens": 1051245023.0, "reward": 1.32421875, "reward_std": 0.2977350652217865, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2039153277873993, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1148.82373046875, "completions/mean_terminated_length": 947.3688354492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38548825315646473, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11345135170374829, "kl": 0.016510009765625, "learning_rate": 7.948863207710704e-07, "loss": 0.0751, "num_tokens": 1051830848.0, "reward": 1.4447544813156128, "reward_std": 0.35639041662216187, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.18368858098983765, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 978.29248046875, "completions/mean_terminated_length": 840.8740234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3857013478237707, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12557491440026874, "kl": 0.017974853515625, "learning_rate": 7.946053975005494e-07, "loss": 0.0981, "num_tokens": 1052341459.0, "reward": 1.4592634439468384, "reward_std": 0.3712087571620941, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.16924987733364105, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 954.3683471679688, "completions/mean_terminated_length": 810.7600708007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38591444249107665, "frac_reward_zero_std": 0.0, "grad_norm": 0.14545143970999172, "kl": 0.020477294921875, "learning_rate": 7.943243388596638e-07, "loss": 0.0665, "num_tokens": 1052833640.0, "reward": 1.6060268878936768, "reward_std": 0.3298066258430481, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1707671582698822, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1110.19873046875, "completions/mean_terminated_length": 819.5350952148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3861275371583826, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12294192509071616, "kl": 0.017669677734375, "learning_rate": 7.940431450039581e-07, "loss": 0.052, "num_tokens": 1053405969.0, "reward": 1.3616071939468384, "reward_std": 0.34001073241233826, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17284893989562988, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1049.747802734375, "completions/mean_terminated_length": 822.7479858398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38634063182568856, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11562395466354608, "kl": 0.0179901123046875, "learning_rate": 7.937618160890516e-07, "loss": 0.0674, "num_tokens": 1053938992.0, "reward": 1.4860491752624512, "reward_std": 0.2867286801338196, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17932796478271484, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 973.4553833007812, "completions/mean_terminated_length": 777.8258666992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3865537264929945, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14547910544433923, "kl": 0.02227783203125, "learning_rate": 7.934803522706382e-07, "loss": 0.1225, "num_tokens": 1054445852.0, "reward": 1.3750001192092896, "reward_std": 0.3737272024154663, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 993.1160888671875, "completions/mean_terminated_length": 833.120849609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3867668211603005, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13114182615344394, "kl": 0.018829345703125, "learning_rate": 7.931987537044867e-07, "loss": 0.1055, "num_tokens": 1054961200.0, "reward": 1.5368304252624512, "reward_std": 0.3504135012626648, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18948033452033997, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 1064.921875, "completions/mean_terminated_length": 864.0779418945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38697991582760644, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11197691250029149, "kl": 0.017822265625, "learning_rate": 7.929170205464403e-07, "loss": 0.086, "num_tokens": 1055509261.0, "reward": 1.5915179252624512, "reward_std": 0.3448924124240875, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16981780529022217, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1127.169677734375, "completions/mean_terminated_length": 869.337158203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.38719301049491234, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12644331294454494, "kl": 0.016204833984375, "learning_rate": 7.926351529524166e-07, "loss": 0.09, "num_tokens": 1056081337.0, "reward": 1.4436384439468384, "reward_std": 0.4390977621078491, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20423343777656555, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 992.7835083007812, "completions/mean_terminated_length": 763.3886108398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3874061051622183, "frac_reward_zero_std": 0.0, "grad_norm": 0.14332686167695827, "kl": 0.018951416015625, "learning_rate": 7.923531510784081e-07, "loss": 0.1209, "num_tokens": 1056601720.0, "reward": 1.5172991752624512, "reward_std": 0.4139867126941681, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.22437745332717896, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1030.4442138671875, "completions/mean_terminated_length": 851.50390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38761919982952425, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11961041495857284, "kl": 0.017364501953125, "learning_rate": 7.920710150804809e-07, "loss": 0.101, "num_tokens": 1057132607.0, "reward": 1.5055804252624512, "reward_std": 0.34474942088127136, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.1887938678264618, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1146.13623046875, "completions/mean_terminated_length": 890.306640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3878322944968302, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12147003152750725, "kl": 0.0174560546875, "learning_rate": 7.917887451147758e-07, "loss": 0.0818, "num_tokens": 1057719036.0, "reward": 1.430803656578064, "reward_std": 0.376973420381546, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.22532203793525696, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 931.3058471679688, "completions/mean_terminated_length": 768.5140380859375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.38804538916413617, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13149656813654068, "kl": 0.021728515625, "learning_rate": 7.915063413375077e-07, "loss": 0.0757, "num_tokens": 1058203349.0, "reward": 1.5658482313156128, "reward_std": 0.3202120363712311, "rewards/accuracy_reward/mean": 0.6527777910232544, "rewards/accuracy_reward/std": 0.47663912177085876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.1781519651412964, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1005.2545166015625, "completions/mean_terminated_length": 818.657958984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3882584838314421, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13781638547097327, "kl": 0.01904296875, "learning_rate": 7.912238039049653e-07, "loss": 0.0593, "num_tokens": 1058719591.0, "reward": 1.5267857313156128, "reward_std": 0.2963954508304596, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.16070762276649475, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1020.0670166015625, "completions/mean_terminated_length": 829.708984375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3884715784987481, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12979604106868722, "kl": 0.02178955078125, "learning_rate": 7.909411329735117e-07, "loss": 0.0919, "num_tokens": 1059237957.0, "reward": 1.5585938692092896, "reward_std": 0.2846677303314209, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1705797165632248, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1189.044677734375, "completions/mean_terminated_length": 895.8682861328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.38868467316605404, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 3.1625034798004856, "kl": 0.237060546875, "learning_rate": 7.906583286995834e-07, "loss": 0.098, "num_tokens": 1059840489.0, "reward": 1.3465402126312256, "reward_std": 0.40076354146003723, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9090401530265808, "rewards/tag_count_reward/std": 0.24166594445705414, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1118.66748046875, "completions/mean_terminated_length": 878.5028076171875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.38889776783336, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12691727788630341, "kl": 0.019866943359375, "learning_rate": 7.90375391239691e-07, "loss": 0.0692, "num_tokens": 1060411828.0, "reward": 1.5892857313156128, "reward_std": 0.32817450165748596, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.1884397715330124, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1067.294677734375, "completions/mean_terminated_length": 863.7520141601562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3891108625006659, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12953045236246757, "kl": 0.0185546875, "learning_rate": 7.900923207504185e-07, "loss": 0.0745, "num_tokens": 1060959224.0, "reward": 1.4514509439468384, "reward_std": 0.34199219942092896, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.192632257938385, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1022.4464721679688, "completions/mean_terminated_length": 816.2359619140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.38932395716797186, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11905626036850288, "kl": 0.01715087890625, "learning_rate": 7.898091173884243e-07, "loss": 0.0669, "num_tokens": 1061487712.0, "reward": 1.4687501192092896, "reward_std": 0.31353992223739624, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1628681719303131, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1008.7745971679688, "completions/mean_terminated_length": 819.5752563476562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3895370518352778, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12384514271152347, "kl": 0.017974853515625, "learning_rate": 7.895257813104393e-07, "loss": 0.0826, "num_tokens": 1062014379.0, "reward": 1.532366156578064, "reward_std": 0.33962759375572205, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19537115097045898, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1019.7388916015625, "completions/mean_terminated_length": 799.5962524414062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.38975014650258377, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1301693090603972, "kl": 0.018463134765625, "learning_rate": 7.892423126732684e-07, "loss": 0.0799, "num_tokens": 1062537878.0, "reward": 1.567522406578064, "reward_std": 0.35119324922561646, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.2017921358346939, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1026.640625, "completions/mean_terminated_length": 827.8159790039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.38996324116988973, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12255415322023203, "kl": 0.018402099609375, "learning_rate": 7.8895871163379e-07, "loss": 0.0947, "num_tokens": 1063075653.0, "reward": 1.547991156578064, "reward_std": 0.3486122190952301, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16169501841068268, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1092.96435546875, "completions/mean_terminated_length": 866.077392578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3901763358371957, "frac_reward_zero_std": 0.0, "grad_norm": 0.11795780774285654, "kl": 0.018829345703125, "learning_rate": 7.886749783489555e-07, "loss": 0.0279, "num_tokens": 1063639957.0, "reward": 1.4330357313156128, "reward_std": 0.32432830333709717, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.17770643532276154, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1048.759033203125, "completions/mean_terminated_length": 879.1749267578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.39038943050450164, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2065606739080652, "kl": 0.0185546875, "learning_rate": 7.883911129757894e-07, "loss": 0.0537, "num_tokens": 1064179625.0, "reward": 1.4414063692092896, "reward_std": 0.2965337038040161, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.16610048711299896, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1161.290283203125, "completions/mean_terminated_length": 872.7160034179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3906025251718076, "frac_reward_zero_std": 0.0, "grad_norm": 0.11368656188594062, "kl": 0.016510009765625, "learning_rate": 7.881071156713893e-07, "loss": 0.0721, "num_tokens": 1064770507.0, "reward": 1.3984376192092896, "reward_std": 0.36735406517982483, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.2030172199010849, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1142.77685546875, "completions/mean_terminated_length": 899.1614379882812, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.3908156198391135, "frac_reward_zero_std": 0.0, "grad_norm": 0.1218887838560854, "kl": 0.0152587890625, "learning_rate": 7.878229865929266e-07, "loss": 0.0943, "num_tokens": 1065359431.0, "reward": 1.4938616752624512, "reward_std": 0.39353471994400024, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.21221931278705597, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1114.7879638671875, "completions/mean_terminated_length": 870.3126220703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.39102871450641946, "frac_reward_zero_std": 0.0, "grad_norm": 0.12822419870253496, "kl": 0.016754150390625, "learning_rate": 7.875387258976444e-07, "loss": 0.0402, "num_tokens": 1065938504.0, "reward": 1.4280134439468384, "reward_std": 0.33710843324661255, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18714261054992676, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1126.5179443359375, "completions/mean_terminated_length": 861.72412109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3912418091737254, "frac_reward_zero_std": 0.0, "grad_norm": 0.12144290200081527, "kl": 0.017669677734375, "learning_rate": 7.872543337428595e-07, "loss": 0.0272, "num_tokens": 1066512800.0, "reward": 1.434709906578064, "reward_std": 0.34116995334625244, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.18177567422389984, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1130.0826416015625, "completions/mean_terminated_length": 924.428955078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3914549038410314, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12528453007055942, "kl": 0.017669677734375, "learning_rate": 7.869698102859612e-07, "loss": 0.0962, "num_tokens": 1067089509.0, "reward": 1.485491156578064, "reward_std": 0.344857782125473, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1880783587694168, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1053.1629638671875, "completions/mean_terminated_length": 833.593994140625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.39166799850833733, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12110837430607493, "kl": 0.0167388916015625, "learning_rate": 7.866851556844115e-07, "loss": 0.0843, "num_tokens": 1067629150.0, "reward": 1.3571429252624512, "reward_std": 0.37157145142555237, "rewards/accuracy_reward/mean": 0.44675925374031067, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.20843316614627838, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1221.9129638671875, "completions/mean_terminated_length": 887.8526611328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3918810931756433, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1210614574211017, "kl": 0.0151214599609375, "learning_rate": 7.864003700957447e-07, "loss": 0.0718, "num_tokens": 1068256951.0, "reward": 1.25, "reward_std": 0.3642135560512543, "rewards/accuracy_reward/mean": 0.35648149251937866, "rewards/accuracy_reward/std": 0.47951504588127136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23560336232185364, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1035.671875, "completions/mean_terminated_length": 832.1206665039062, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.39209418784294925, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13549143144958467, "kl": 0.017303466796875, "learning_rate": 7.861154536775679e-07, "loss": 0.0923, "num_tokens": 1068780980.0, "reward": 1.383928656578064, "reward_std": 0.38585197925567627, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9129464030265808, "rewards/tag_count_reward/std": 0.22857847809791565, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1128.9263916015625, "completions/mean_terminated_length": 894.6527099609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3923072825102552, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1272736979876836, "kl": 0.0174560546875, "learning_rate": 7.858304065875607e-07, "loss": 0.0606, "num_tokens": 1069357331.0, "reward": 1.3727679252624512, "reward_std": 0.31531792879104614, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20194751024246216, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1126.9554443359375, "completions/mean_terminated_length": 858.8703002929688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3925203771775611, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.35038197534731513, "kl": 0.017913818359375, "learning_rate": 7.855452289834746e-07, "loss": 0.0889, "num_tokens": 1069937807.0, "reward": 1.383928656578064, "reward_std": 0.3546689748764038, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23615379631519318, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1217.5848388671875, "completions/mean_terminated_length": 896.2166748046875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.39273347184486707, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11493389150541113, "kl": 0.015777587890625, "learning_rate": 7.852599210231339e-07, "loss": 0.088, "num_tokens": 1070554629.0, "reward": 1.4118304252624512, "reward_std": 0.3486160635948181, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.21787147223949432, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1047.53125, "completions/mean_terminated_length": 849.5775756835938, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.392946566512173, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12421459496491848, "kl": 0.0166473388671875, "learning_rate": 7.849744828644344e-07, "loss": 0.094, "num_tokens": 1071090563.0, "reward": 1.5351563692092896, "reward_std": 0.381756991147995, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19872502982616425, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1018.435302734375, "completions/mean_terminated_length": 784.3150634765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.393159661179479, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14857553154008735, "kl": 0.02093505859375, "learning_rate": 7.846889146653445e-07, "loss": 0.1141, "num_tokens": 1071614678.0, "reward": 1.4743304252624512, "reward_std": 0.382669061422348, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.2122759371995926, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1133.5223388671875, "completions/mean_terminated_length": 890.6949462890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.39337275584678494, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14152765966101366, "kl": 0.019744873046875, "learning_rate": 7.84403216583904e-07, "loss": 0.0681, "num_tokens": 1072197408.0, "reward": 1.4324777126312256, "reward_std": 0.35994189977645874, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.2239653617143631, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 993.9464721679688, "completions/mean_terminated_length": 743.5359497070312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3935858505140909, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12327990144160939, "kl": 0.02008056640625, "learning_rate": 7.841173887782253e-07, "loss": 0.1227, "num_tokens": 1072716520.0, "reward": 1.5580357313156128, "reward_std": 0.31139156222343445, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.194285050034523, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1206.305908203125, "completions/mean_terminated_length": 919.02099609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.39379894518139685, "frac_reward_zero_std": 0.0, "grad_norm": 0.18729405839003072, "kl": 0.014617919921875, "learning_rate": 7.838314314064922e-07, "loss": 0.0849, "num_tokens": 1073330929.0, "reward": 1.4648438692092896, "reward_std": 0.39464667439460754, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.21064873039722443, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1034.259033203125, "completions/mean_terminated_length": 810.5177001953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3940120398487028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1259947520500176, "kl": 0.01678466796875, "learning_rate": 7.8354534462696e-07, "loss": 0.0736, "num_tokens": 1073860453.0, "reward": 1.4955357313156128, "reward_std": 0.31719037890434265, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.18313127756118774, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1131.0982666015625, "completions/mean_terminated_length": 867.6206665039062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3942251345160087, "frac_reward_zero_std": 0.0, "grad_norm": 0.12491957119243292, "kl": 0.016632080078125, "learning_rate": 7.832591285979559e-07, "loss": 0.0607, "num_tokens": 1074436177.0, "reward": 1.3828126192092896, "reward_std": 0.35941779613494873, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20094045996665955, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1162.1116943359375, "completions/mean_terminated_length": 866.8154907226562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.39443822918331467, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13605322173728168, "kl": 0.018218994140625, "learning_rate": 7.829727834778786e-07, "loss": 0.0502, "num_tokens": 1075033779.0, "reward": 1.3610491752624512, "reward_std": 0.26947885751724243, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.18787497282028198, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 989.1317138671875, "completions/mean_terminated_length": 772.8037719726562, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.39465132385062063, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13539899688717502, "kl": 0.019287109375, "learning_rate": 7.82686309425198e-07, "loss": 0.1039, "num_tokens": 1075545582.0, "reward": 1.4732143878936768, "reward_std": 0.34325820207595825, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19134558737277985, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1127.138427734375, "completions/mean_terminated_length": 834.6294555664062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3948644185179266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1155484871912658, "kl": 0.0164337158203125, "learning_rate": 7.82399706598456e-07, "loss": 0.0934, "num_tokens": 1076113468.0, "reward": 1.4391741752624512, "reward_std": 0.33238133788108826, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19942738115787506, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 951.9754638671875, "completions/mean_terminated_length": 749.0079345703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.39507751318523254, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14515584977915927, "kl": 0.01806640625, "learning_rate": 7.82112975156265e-07, "loss": 0.0801, "num_tokens": 1076611953.0, "reward": 1.5106027126312256, "reward_std": 0.2995833158493042, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1885978728532791, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 986.513427734375, "completions/mean_terminated_length": 822.365966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3952906078525385, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13106390039802518, "kl": 0.019287109375, "learning_rate": 7.81826115257309e-07, "loss": 0.1322, "num_tokens": 1077117431.0, "reward": 1.5998884439468384, "reward_std": 0.3600275218486786, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.19280068576335907, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1068.0670166015625, "completions/mean_terminated_length": 831.9058227539062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.39550370251984446, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12260616543191685, "kl": 0.0161895751953125, "learning_rate": 7.81539127060343e-07, "loss": 0.0843, "num_tokens": 1077668917.0, "reward": 1.4012277126312256, "reward_std": 0.33150237798690796, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19662198424339294, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 930.7879638671875, "completions/mean_terminated_length": 741.1828002929688, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.3957167971871504, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13830920474611677, "kl": 0.020294189453125, "learning_rate": 7.812520107241929e-07, "loss": 0.094, "num_tokens": 1078150134.0, "reward": 1.5485491752624512, "reward_std": 0.3066059947013855, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.16355563700199127, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1094.1607666015625, "completions/mean_terminated_length": 854.3687133789062, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.3959298918544563, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13671912048210633, "kl": 0.0186767578125, "learning_rate": 7.809647664077557e-07, "loss": 0.1144, "num_tokens": 1078700510.0, "reward": 1.3945313692092896, "reward_std": 0.32575950026512146, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.21163024008274078, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1004.747802734375, "completions/mean_terminated_length": 814.8153076171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3961429865217623, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.23542843650774636, "kl": 0.01959228515625, "learning_rate": 7.806773942699992e-07, "loss": 0.0776, "num_tokens": 1079226429.0, "reward": 1.5033482313156128, "reward_std": 0.3194870948791504, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.1717585176229477, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 983.3460083007812, "completions/mean_terminated_length": 786.1878051757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.39635608118906823, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13045063785346978, "kl": 0.01837158203125, "learning_rate": 7.803898944699619e-07, "loss": 0.1571, "num_tokens": 1079732568.0, "reward": 1.4888393878936768, "reward_std": 0.35875171422958374, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18981274962425232, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1037.3773193359375, "completions/mean_terminated_length": 804.1566162109375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3965691758563742, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14387530623122705, "kl": 0.0184326171875, "learning_rate": 7.801022671667528e-07, "loss": 0.0643, "num_tokens": 1080265441.0, "reward": 1.4983259439468384, "reward_std": 0.32769328355789185, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19555239379405975, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 971.857177734375, "completions/mean_terminated_length": 775.9367065429688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.39678227052368015, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1368111486404299, "kl": 0.019744873046875, "learning_rate": 7.798145125195515e-07, "loss": 0.0495, "num_tokens": 1080766849.0, "reward": 1.4921876192092896, "reward_std": 0.2906310558319092, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19537116587162018, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1109.904052734375, "completions/mean_terminated_length": 833.35546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3969953651909861, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12390137295764388, "kl": 0.017791748046875, "learning_rate": 7.795266306876084e-07, "loss": 0.1167, "num_tokens": 1081329734.0, "reward": 1.4475446939468384, "reward_std": 0.32965415716171265, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17320609092712402, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1075.1629638671875, "completions/mean_terminated_length": 820.3070068359375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.39720845985829206, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12405760210112007, "kl": 0.0166015625, "learning_rate": 7.79238621830244e-07, "loss": 0.0873, "num_tokens": 1081880607.0, "reward": 1.465959906578064, "reward_std": 0.3064712882041931, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17189201712608337, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1096.919677734375, "completions/mean_terminated_length": 867.7119140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.397421554525598, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11499181809144696, "kl": 0.017364501953125, "learning_rate": 7.789504861068492e-07, "loss": 0.0899, "num_tokens": 1082443883.0, "reward": 1.5412946939468384, "reward_std": 0.30650460720062256, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.18805180490016937, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1171.3326416015625, "completions/mean_terminated_length": 850.6005859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3976346491929039, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1130525972229058, "kl": 0.016204833984375, "learning_rate": 7.786622236768849e-07, "loss": 0.0612, "num_tokens": 1083033760.0, "reward": 1.4525669813156128, "reward_std": 0.3609563410282135, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.2174951434135437, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 992.6920166015625, "completions/mean_terminated_length": 777.0913696289062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3978477438602099, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14285254291358337, "kl": 0.02239990234375, "learning_rate": 7.783738346998825e-07, "loss": 0.0879, "num_tokens": 1083550230.0, "reward": 1.5189732313156128, "reward_std": 0.30908212065696716, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.2057536393404007, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 917.1183471679688, "completions/mean_terminated_length": 748.9359130859375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.39806083852751584, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1283459036976161, "kl": 0.02227783203125, "learning_rate": 7.780853193354431e-07, "loss": 0.0685, "num_tokens": 1084027579.0, "reward": 1.5033482313156128, "reward_std": 0.34642207622528076, "rewards/accuracy_reward/mean": 0.5817307829856873, "rewards/accuracy_reward/std": 0.49386879801750183, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.13496457040309906, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1048.6473388671875, "completions/mean_terminated_length": 814.63916015625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.3982739331948218, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11832895675423899, "kl": 0.017364501953125, "learning_rate": 7.777966777432379e-07, "loss": 0.0695, "num_tokens": 1084567341.0, "reward": 1.5172991752624512, "reward_std": 0.29939672350883484, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19084173440933228, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1030.6875, "completions/mean_terminated_length": 832.650634765625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.39848702786212775, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1290271125551043, "kl": 0.019989013671875, "learning_rate": 7.775079100830078e-07, "loss": 0.0513, "num_tokens": 1085092993.0, "reward": 1.4547991752624512, "reward_std": 0.27645638585090637, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19944614171981812, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1022.8928833007812, "completions/mean_terminated_length": 803.4254760742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3987001225294337, "frac_reward_zero_std": 0.0, "grad_norm": 0.13860128657505907, "kl": 0.017364501953125, "learning_rate": 7.772190165145638e-07, "loss": 0.0616, "num_tokens": 1085617569.0, "reward": 1.4927456378936768, "reward_std": 0.37611469626426697, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.16943417489528656, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1092.165283203125, "completions/mean_terminated_length": 887.5285034179688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.39891321719673967, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12031813342865773, "kl": 0.01739501953125, "learning_rate": 7.769299971977864e-07, "loss": 0.0614, "num_tokens": 1086178315.0, "reward": 1.5094866752624512, "reward_std": 0.3737342655658722, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.2104293406009674, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1002.1116333007812, "completions/mean_terminated_length": 843.480712890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3991263118640456, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12344596734662744, "kl": 0.017730712890625, "learning_rate": 7.766408522926254e-07, "loss": 0.1013, "num_tokens": 1086688589.0, "reward": 1.571428656578064, "reward_std": 0.3135511577129364, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1613743007183075, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1017.560302734375, "completions/mean_terminated_length": 758.5111694335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3993394065313515, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14424721796527923, "kl": 0.020660400390625, "learning_rate": 7.763515819591006e-07, "loss": 0.0896, "num_tokens": 1087211928.0, "reward": 1.5251116752624512, "reward_std": 0.3216903805732727, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.20596210658550262, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 977.075927734375, "completions/mean_terminated_length": 814.6478271484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3995525011986575, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13209523098902112, "kl": 0.019683837890625, "learning_rate": 7.760621863573009e-07, "loss": 0.0777, "num_tokens": 1087717994.0, "reward": 1.540178656578064, "reward_std": 0.33858421444892883, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.16601119935512543, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 982.8594360351562, "completions/mean_terminated_length": 772.109619140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.39976559586596344, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12241846282892788, "kl": 0.018157958984375, "learning_rate": 7.757726656473846e-07, "loss": 0.0988, "num_tokens": 1088223739.0, "reward": 1.5530134439468384, "reward_std": 0.30941325426101685, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1944383978843689, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1097.419677734375, "completions/mean_terminated_length": 834.7236328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3999786905332694, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11712741179211929, "kl": 0.0165252685546875, "learning_rate": 7.754830199895793e-07, "loss": 0.0794, "num_tokens": 1088787607.0, "reward": 1.3789063692092896, "reward_std": 0.34759923815727234, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22595205903053284, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 942.6116333007812, "completions/mean_terminated_length": 778.2205200195312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.40019178520057536, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11591270389671879, "kl": 0.019775390625, "learning_rate": 7.751932495441818e-07, "loss": 0.072, "num_tokens": 1089277593.0, "reward": 1.5585938692092896, "reward_std": 0.32834160327911377, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19267114996910095, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1050.2210693359375, "completions/mean_terminated_length": 813.1795654296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4004048798678813, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13845436320110055, "kl": 0.016998291015625, "learning_rate": 7.749033544715576e-07, "loss": 0.111, "num_tokens": 1089820652.0, "reward": 1.4603794813156128, "reward_std": 0.3966957628726959, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.2092752456665039, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1063.700927734375, "completions/mean_terminated_length": 826.4874877929688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.40061797453518727, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12853013638243657, "kl": 0.0180511474609375, "learning_rate": 7.746133349321416e-07, "loss": 0.0622, "num_tokens": 1090374870.0, "reward": 1.4559152126312256, "reward_std": 0.3015094995498657, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20909620821475983, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1062.618408203125, "completions/mean_terminated_length": 821.7472534179688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.40083106920249323, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1328552826149423, "kl": 0.019012451171875, "learning_rate": 7.743231910864376e-07, "loss": 0.0806, "num_tokens": 1090922747.0, "reward": 1.5039063692092896, "reward_std": 0.2862297296524048, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18560869991779327, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1038.857177734375, "completions/mean_terminated_length": 861.3963012695312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.40104416386979913, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12812715566365404, "kl": 0.018402099609375, "learning_rate": 7.740329230950175e-07, "loss": 0.1159, "num_tokens": 1091457659.0, "reward": 1.4302456378936768, "reward_std": 0.334820419549942, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1833348423242569, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1086.685302734375, "completions/mean_terminated_length": 880.8753662109375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.4012572585371051, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11487268419937838, "kl": 0.0164031982421875, "learning_rate": 7.737425311185229e-07, "loss": 0.0655, "num_tokens": 1092012654.0, "reward": 1.465959906578064, "reward_std": 0.3478304445743561, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.16876234114170074, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1025.4241943359375, "completions/mean_terminated_length": 792.8931884765625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.40147035320441105, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13249806583037582, "kl": 0.01959228515625, "learning_rate": 7.734520153176635e-07, "loss": 0.045, "num_tokens": 1092536748.0, "reward": 1.520647406578064, "reward_std": 0.3310166001319885, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.1667456179857254, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1179.7098388671875, "completions/mean_terminated_length": 907.255126953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.401683447871717, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1190475966080052, "kl": 0.017578125, "learning_rate": 7.731613758532173e-07, "loss": 0.114, "num_tokens": 1093134602.0, "reward": 1.4575893878936768, "reward_std": 0.359716534614563, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.20299570262432098, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1116.165283203125, "completions/mean_terminated_length": 881.905029296875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.40189654253902296, "frac_reward_zero_std": 0.0, "grad_norm": 0.12008452794924968, "kl": 0.0174560546875, "learning_rate": 7.728706128860309e-07, "loss": 0.0784, "num_tokens": 1093705172.0, "reward": 1.4090402126312256, "reward_std": 0.3371075987815857, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.1933954805135727, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1038.13623046875, "completions/mean_terminated_length": 847.9495849609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4021096372063289, "frac_reward_zero_std": 0.0, "grad_norm": 0.17287242026321614, "kl": 0.01898193359375, "learning_rate": 7.725797265770199e-07, "loss": 0.1004, "num_tokens": 1094241873.0, "reward": 1.4375001192092896, "reward_std": 0.3682398200035095, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19212691485881805, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1085.3192138671875, "completions/mean_terminated_length": 812.2378540039062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4023227318736349, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12364401603077671, "kl": 0.019561767578125, "learning_rate": 7.722887170871669e-07, "loss": 0.0605, "num_tokens": 1094798512.0, "reward": 1.4676339626312256, "reward_std": 0.2902711033821106, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.175497367978096, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 964.6473388671875, "completions/mean_terminated_length": 816.16748046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40253582654094083, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12486356765018565, "kl": 0.01806640625, "learning_rate": 7.719975845775241e-07, "loss": 0.077, "num_tokens": 1095293490.0, "reward": 1.5273438692092896, "reward_std": 0.34471288323402405, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.1910770684480667, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1130.9910888671875, "completions/mean_terminated_length": 913.13818359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4027489212082468, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1231492638390904, "kl": 0.01641845703125, "learning_rate": 7.717063292092104e-07, "loss": 0.0982, "num_tokens": 1095869678.0, "reward": 1.520647406578064, "reward_std": 0.3620290458202362, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.201184943318367, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1038.04248046875, "completions/mean_terminated_length": 791.1638793945312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4029620158755527, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12213762455203987, "kl": 0.0181121826171875, "learning_rate": 7.71414951143414e-07, "loss": 0.0623, "num_tokens": 1096412961.0, "reward": 1.5094866752624512, "reward_std": 0.24504512548446655, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.16474206745624542, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 965.1897583007812, "completions/mean_terminated_length": 768.055419921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40317511054285865, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12053049656910904, "kl": 0.020294189453125, "learning_rate": 7.711234505413896e-07, "loss": 0.0646, "num_tokens": 1096910502.0, "reward": 1.5161831378936768, "reward_std": 0.3053971827030182, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17351123690605164, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1061.950927734375, "completions/mean_terminated_length": 817.4985961914062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4033882052101646, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11493617999075069, "kl": 0.0181884765625, "learning_rate": 7.708318275644612e-07, "loss": 0.0908, "num_tokens": 1097459744.0, "reward": 1.450334906578064, "reward_std": 0.3695110082626343, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.24555550515651703, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1113.3482666015625, "completions/mean_terminated_length": 861.8130493164062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40360129987747057, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 5.8237167183663345, "kl": 0.294677734375, "learning_rate": 7.705400823740194e-07, "loss": 0.0736, "num_tokens": 1098044252.0, "reward": 1.4285714626312256, "reward_std": 0.383600652217865, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.22875317931175232, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1049.53125, "completions/mean_terminated_length": 819.1154174804688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4038143945447765, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12328979938218865, "kl": 0.01873779296875, "learning_rate": 7.702482151315229e-07, "loss": 0.0841, "num_tokens": 1098580426.0, "reward": 1.4497768878936768, "reward_std": 0.2986149191856384, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17798370122909546, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 983.19873046875, "completions/mean_terminated_length": 812.1683959960938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4040274892120825, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12273372867004068, "kl": 0.019805908203125, "learning_rate": 7.699562259984979e-07, "loss": 0.0749, "num_tokens": 1099091347.0, "reward": 1.6110491752624512, "reward_std": 0.31121882796287537, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17706511914730072, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1118.154052734375, "completions/mean_terminated_length": 881.1344604492188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.40424058387938844, "frac_reward_zero_std": 0.0, "grad_norm": 0.12539019288658815, "kl": 0.0167083740234375, "learning_rate": 7.696641151365379e-07, "loss": 0.0775, "num_tokens": 1099660872.0, "reward": 1.3431919813156128, "reward_std": 0.34335654973983765, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17828068137168884, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1071.796875, "completions/mean_terminated_length": 843.2093505859375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4044536785466944, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11245772545518806, "kl": 0.018035888671875, "learning_rate": 7.693718827073042e-07, "loss": 0.0446, "num_tokens": 1100209725.0, "reward": 1.4135044813156128, "reward_std": 0.34260883927345276, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.16131143271923065, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1123.669677734375, "completions/mean_terminated_length": 851.17919921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4046667732140003, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1386534029995427, "kl": 0.017730712890625, "learning_rate": 7.690795288725247e-07, "loss": 0.0865, "num_tokens": 1100789625.0, "reward": 1.4944196939468384, "reward_std": 0.3147575855255127, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.19750650227069855, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1095.6138916015625, "completions/mean_terminated_length": 875.8324584960938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.40487986788130625, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1410802082284113, "kl": 0.0186767578125, "learning_rate": 7.687870537939953e-07, "loss": 0.0668, "num_tokens": 1101351628.0, "reward": 1.4375001192092896, "reward_std": 0.3177659511566162, "rewards/accuracy_reward/mean": 0.5393518805503845, "rewards/accuracy_reward/std": 0.49902692437171936, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9174107313156128, "rewards/tag_count_reward/std": 0.2240774929523468, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 946.5714721679688, "completions/mean_terminated_length": 792.427490234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4050929625486122, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12392036962717298, "kl": 0.018524169921875, "learning_rate": 7.684944576335781e-07, "loss": 0.0998, "num_tokens": 1101844044.0, "reward": 1.5463169813156128, "reward_std": 0.2869468629360199, "rewards/accuracy_reward/mean": 0.6087962985038757, "rewards/accuracy_reward/std": 0.4885856807231903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15647153556346893, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 917.83935546875, "completions/mean_terminated_length": 788.5173950195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40530605721591817, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1178605409478237, "kl": 0.01971435546875, "learning_rate": 7.682017405532032e-07, "loss": 0.0511, "num_tokens": 1102314564.0, "reward": 1.5747768878936768, "reward_std": 0.2550079822540283, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.166563019156456, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1052.9375, "completions/mean_terminated_length": 830.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4055191518832241, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13026846576980652, "kl": 0.019622802734375, "learning_rate": 7.679089027148668e-07, "loss": 0.0778, "num_tokens": 1102853192.0, "reward": 1.4654018878936768, "reward_std": 0.2923460006713867, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19537116587162018, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1042.513427734375, "completions/mean_terminated_length": 843.5668334960938, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4057322465505301, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11842871270811793, "kl": 0.019256591796875, "learning_rate": 7.676159442806321e-07, "loss": 0.0434, "num_tokens": 1103388478.0, "reward": 1.532366156578064, "reward_std": 0.3286937177181244, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14629201591014862, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1080.46435546875, "completions/mean_terminated_length": 837.22900390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40594534121783604, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11699158131722365, "kl": 0.0172119140625, "learning_rate": 7.673228654126292e-07, "loss": 0.083, "num_tokens": 1103935502.0, "reward": 1.446428656578064, "reward_std": 0.3618532121181488, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20673725008964539, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1024.890625, "completions/mean_terminated_length": 844.9737548828125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.406158435885142, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11684507860441362, "kl": 0.019012451171875, "learning_rate": 7.670296662730552e-07, "loss": 0.0661, "num_tokens": 1104465485.0, "reward": 1.5212054252624512, "reward_std": 0.33779019117355347, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16402533650398254, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1201.3348388671875, "completions/mean_terminated_length": 895.09423828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4063715305524479, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11970346142741332, "kl": 0.0160064697265625, "learning_rate": 7.66736347024173e-07, "loss": 0.0853, "num_tokens": 1105079075.0, "reward": 1.3353794813156128, "reward_std": 0.3123358488082886, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.15823286771774292, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 978.6138916015625, "completions/mean_terminated_length": 838.1893920898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40658462521975386, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11721629706218116, "kl": 0.017791748046875, "learning_rate": 7.664429078283127e-07, "loss": 0.0412, "num_tokens": 1105587894.0, "reward": 1.4843751192092896, "reward_std": 0.2872348725795746, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1342271864414215, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1116.118408203125, "completions/mean_terminated_length": 868.6694946289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4067977198870598, "frac_reward_zero_std": 0.0, "grad_norm": 0.12476349915574803, "kl": 0.0177001953125, "learning_rate": 7.6614934884787e-07, "loss": 0.0751, "num_tokens": 1106160875.0, "reward": 1.3978794813156128, "reward_std": 0.37329185009002686, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2233457714319229, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1034.493408203125, "completions/mean_terminated_length": 800.607177734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.4070108145543658, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 1.6015496574119263, "kl": 0.06658935546875, "learning_rate": 7.658556702453075e-07, "loss": 0.0797, "num_tokens": 1106700488.0, "reward": 1.5764509439468384, "reward_std": 0.2860429286956787, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18338249623775482, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 893.4397583007812, "completions/mean_terminated_length": 735.2005004882812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.40722390922167173, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14125542459461077, "kl": 0.02130126953125, "learning_rate": 7.655618721831538e-07, "loss": 0.0745, "num_tokens": 1107166077.0, "reward": 1.6250001192092896, "reward_std": 0.27611470222473145, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15008719265460968, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1029.94873046875, "completions/mean_terminated_length": 835.0026245117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4074370038889777, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1212086610378566, "kl": 0.018218994140625, "learning_rate": 7.652679548240038e-07, "loss": 0.0512, "num_tokens": 1107697654.0, "reward": 1.5920759439468384, "reward_std": 0.3181268274784088, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.15465790033340454, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1011.9152221679688, "completions/mean_terminated_length": 826.5105590820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40765009855628365, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1576189361046836, "kl": 0.0196533203125, "learning_rate": 7.649739183305183e-07, "loss": 0.0849, "num_tokens": 1108217472.0, "reward": 1.4983259439468384, "reward_std": 0.2966291010379791, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16827340424060822, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1028.84375, "completions/mean_terminated_length": 810.6504516601562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4078631932235896, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12359925338967898, "kl": 0.019989013671875, "learning_rate": 7.646797628654236e-07, "loss": 0.072, "num_tokens": 1108745146.0, "reward": 1.4665179252624512, "reward_std": 0.3006556034088135, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.19207490980625153, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 992.872802734375, "completions/mean_terminated_length": 804.060546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4080762878908955, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1183473827657037, "kl": 0.020111083984375, "learning_rate": 7.643854885915128e-07, "loss": 0.0634, "num_tokens": 1109257649.0, "reward": 1.4598214626312256, "reward_std": 0.3134208619594574, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.17832356691360474, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1189.3192138671875, "completions/mean_terminated_length": 923.1783447265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.40828938255820146, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11657960379917942, "kl": 0.0164947509765625, "learning_rate": 7.640910956716437e-07, "loss": 0.0986, "num_tokens": 1109859152.0, "reward": 1.465959906578064, "reward_std": 0.3952101469039917, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.23859341442584991, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1006.3035888671875, "completions/mean_terminated_length": 800.1925048828125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4085024772255074, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13964906422790405, "kl": 0.01837158203125, "learning_rate": 7.637965842687404e-07, "loss": 0.0816, "num_tokens": 1110381368.0, "reward": 1.4670759439468384, "reward_std": 0.3010718822479248, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.1591140478849411, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 983.74560546875, "completions/mean_terminated_length": 766.3171997070312, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4087155718928134, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13604018468250148, "kl": 0.019744873046875, "learning_rate": 7.635019545457923e-07, "loss": 0.0765, "num_tokens": 1110890566.0, "reward": 1.454241156578064, "reward_std": 0.29963263869285583, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.18175934255123138, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1196.1273193359375, "completions/mean_terminated_length": 922.2212524414062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40892866656011934, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1177594379251397, "kl": 0.016845703125, "learning_rate": 7.632072066658549e-07, "loss": 0.0621, "num_tokens": 1111503631.0, "reward": 1.31640625, "reward_std": 0.35845157504081726, "rewards/accuracy_reward/mean": 0.40509259700775146, "rewards/accuracy_reward/std": 0.49147915840148926, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9235491156578064, "rewards/tag_count_reward/std": 0.2190280258655548, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 933.2053833007812, "completions/mean_terminated_length": 783.6253051757812, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4091417612274253, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11169386231734406, "kl": 0.020538330078125, "learning_rate": 7.62912340792048e-07, "loss": 0.0108, "num_tokens": 1111982843.0, "reward": 1.5491071939468384, "reward_std": 0.2688133418560028, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.1424395591020584, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1150.5335693359375, "completions/mean_terminated_length": 899.2428588867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.40935485589473125, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11669236221881746, "kl": 0.0157012939453125, "learning_rate": 7.626173570875576e-07, "loss": 0.0829, "num_tokens": 1112570858.0, "reward": 1.3465402126312256, "reward_std": 0.33845916390419006, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.1933760941028595, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 974.4397583007812, "completions/mean_terminated_length": 805.2222290039062, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.4095679505620372, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13847781885523142, "kl": 0.018341064453125, "learning_rate": 7.623222557156344e-07, "loss": 0.1179, "num_tokens": 1113073791.0, "reward": 1.5920759439468384, "reward_std": 0.38161033391952515, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19768577814102173, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1138.43310546875, "completions/mean_terminated_length": 893.6487426757812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4097810452293431, "frac_reward_zero_std": 0.0, "grad_norm": 3.1033313550511004, "kl": 0.077972412109375, "learning_rate": 7.620270368395947e-07, "loss": 0.0734, "num_tokens": 1113657489.0, "reward": 1.4017857313156128, "reward_std": 0.3582235276699066, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18152911961078644, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1050.703125, "completions/mean_terminated_length": 859.7313842773438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.40999413989664907, "frac_reward_zero_std": 0.0, "grad_norm": 0.13789160292773583, "kl": 0.01885986328125, "learning_rate": 7.617317006228193e-07, "loss": 0.1267, "num_tokens": 1114195644.0, "reward": 1.4575893878936768, "reward_std": 0.3317590653896332, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.16865228116512299, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1090.140625, "completions/mean_terminated_length": 927.5796508789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.410207234563955, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13140616526494253, "kl": 0.017974853515625, "learning_rate": 7.614362472287543e-07, "loss": 0.1185, "num_tokens": 1114750507.0, "reward": 1.5775669813156128, "reward_std": 0.3307828903198242, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.1600995808839798, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1103.2701416015625, "completions/mean_terminated_length": 919.3626708984375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.410420329231261, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11497199331048603, "kl": 0.01812744140625, "learning_rate": 7.611406768209105e-07, "loss": 0.0985, "num_tokens": 1115311012.0, "reward": 1.4168527126312256, "reward_std": 0.2822326123714447, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18409591913223267, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1101.82373046875, "completions/mean_terminated_length": 847.1869506835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41063342389856694, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12747365124474375, "kl": 0.01995849609375, "learning_rate": 7.608449895628636e-07, "loss": 0.0669, "num_tokens": 1115870309.0, "reward": 1.4821429252624512, "reward_std": 0.33656126260757446, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9263392686843872, "rewards/tag_count_reward/std": 0.21254922449588776, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 970.497802734375, "completions/mean_terminated_length": 835.1331787109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4108465185658729, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12675855682400994, "kl": 0.0205078125, "learning_rate": 7.605491856182537e-07, "loss": 0.0337, "num_tokens": 1116367428.0, "reward": 1.5731027126312256, "reward_std": 0.25458186864852905, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.15814605355262756, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 996.8147583007812, "completions/mean_terminated_length": 843.5728759765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41105961323317886, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12126914495141694, "kl": 0.019561767578125, "learning_rate": 7.602532651507858e-07, "loss": 0.0526, "num_tokens": 1116881601.0, "reward": 1.5920759439468384, "reward_std": 0.26761388778686523, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1746300309896469, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1064.1875, "completions/mean_terminated_length": 863.1935424804688, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4112727079004848, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11458607807511013, "kl": 0.019317626953125, "learning_rate": 7.599572283242291e-07, "loss": 0.0365, "num_tokens": 1117422437.0, "reward": 1.4799107313156128, "reward_std": 0.2579093873500824, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.19500340521335602, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1065.5023193359375, "completions/mean_terminated_length": 828.7229614257812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4114858025677907, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12368527621923536, "kl": 0.0172576904296875, "learning_rate": 7.596610753024174e-07, "loss": 0.0745, "num_tokens": 1117963654.0, "reward": 1.5055804252624512, "reward_std": 0.33166131377220154, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.19743064045906067, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1040.33935546875, "completions/mean_terminated_length": 840.9625854492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4116988972350967, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13770041230205, "kl": 0.018218994140625, "learning_rate": 7.593648062492486e-07, "loss": 0.0743, "num_tokens": 1118510462.0, "reward": 1.4174107313156128, "reward_std": 0.3346589505672455, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18684300780296326, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1000.0513916015625, "completions/mean_terminated_length": 818.9921264648438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.41191199190240263, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1318098802807133, "kl": 0.019622802734375, "learning_rate": 7.590684213286852e-07, "loss": 0.131, "num_tokens": 1119025765.0, "reward": 1.4838169813156128, "reward_std": 0.34468933939933777, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.1652565598487854, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 936.466552734375, "completions/mean_terminated_length": 747.8250732421875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4121250865697086, "frac_reward_zero_std": 0.25, "grad_norm": 0.11760137475832275, "kl": 0.02081298828125, "learning_rate": 7.587719207047534e-07, "loss": 0.0443, "num_tokens": 1119511462.0, "reward": 1.4670759439468384, "reward_std": 0.24830979108810425, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.15255293250083923, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1042.984375, "completions/mean_terminated_length": 881.5569458007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41233818123701454, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13707341835411435, "kl": 0.01953125, "learning_rate": 7.584753045415436e-07, "loss": 0.0551, "num_tokens": 1120053503.0, "reward": 1.4804688692092896, "reward_std": 0.30049338936805725, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.16282889246940613, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1118.8773193359375, "completions/mean_terminated_length": 885.298828125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.4125512759043205, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12387350751807251, "kl": 0.0186767578125, "learning_rate": 7.581785730032102e-07, "loss": 0.0891, "num_tokens": 1120624888.0, "reward": 1.4765626192092896, "reward_std": 0.3474372923374176, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.21554414927959442, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1040.5513916015625, "completions/mean_terminated_length": 878.733154296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.41276437057162646, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12335153143584317, "kl": 0.01910400390625, "learning_rate": 7.578817262539713e-07, "loss": 0.0834, "num_tokens": 1121162815.0, "reward": 1.5809152126312256, "reward_std": 0.3612014353275299, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.47195106744766235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19048160314559937, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 981.0491333007812, "completions/mean_terminated_length": 819.2236328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4129774652389324, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12399434327209002, "kl": 0.0203857421875, "learning_rate": 7.575847644581089e-07, "loss": 0.0587, "num_tokens": 1121666741.0, "reward": 1.4988839626312256, "reward_std": 0.283370703458786, "rewards/accuracy_reward/mean": 0.5717592835426331, "rewards/accuracy_reward/std": 0.49539753794670105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.18175935745239258, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1121.140625, "completions/mean_terminated_length": 881.6151733398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4131905599062383, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11752852066908871, "kl": 0.0165252685546875, "learning_rate": 7.572876877799686e-07, "loss": 0.104, "num_tokens": 1122246404.0, "reward": 1.4871652126312256, "reward_std": 0.3367440402507782, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17399263381958008, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1100.0335693359375, "completions/mean_terminated_length": 838.059814453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4134036545735443, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11903454038818558, "kl": 0.016693115234375, "learning_rate": 7.569904963839598e-07, "loss": 0.074, "num_tokens": 1122809203.0, "reward": 1.419084906578064, "reward_std": 0.3181091248989105, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.16924987733364105, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 955.950927734375, "completions/mean_terminated_length": 746.8350830078125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.41361674924085023, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14566865833926723, "kl": 0.020263671875, "learning_rate": 7.566931904345548e-07, "loss": 0.0788, "num_tokens": 1123305229.0, "reward": 1.563616156578064, "reward_std": 0.32548803091049194, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1611691564321518, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 982.5178833007812, "completions/mean_terminated_length": 754.406494140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4138298439081562, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13735596938993327, "kl": 0.019866943359375, "learning_rate": 7.563957700962899e-07, "loss": 0.1024, "num_tokens": 1123817813.0, "reward": 1.5301339626312256, "reward_std": 0.2228216826915741, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16823354363441467, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 997.7857666015625, "completions/mean_terminated_length": 806.5858154296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41404293857546215, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1164040307777098, "kl": 0.01806640625, "learning_rate": 7.560982355337647e-07, "loss": 0.0168, "num_tokens": 1124337957.0, "reward": 1.4704241752624512, "reward_std": 0.2959195673465729, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.1500861495733261, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 991.4553833007812, "completions/mean_terminated_length": 785.7813110351562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4142560332427681, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1472234281520342, "kl": 0.020904541015625, "learning_rate": 7.558005869116416e-07, "loss": 0.1302, "num_tokens": 1124852385.0, "reward": 1.6049107313156128, "reward_std": 0.22037231922149658, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19644489884376526, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1034.810302734375, "completions/mean_terminated_length": 856.6378173828125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.41446912791007406, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12173272089035926, "kl": 0.017486572265625, "learning_rate": 7.555028243946462e-07, "loss": 0.0594, "num_tokens": 1125389708.0, "reward": 1.4642857313156128, "reward_std": 0.3436257541179657, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.2068338543176651, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1104.75, "completions/mean_terminated_length": 864.313720703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.41468222257738, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11731568543574519, "kl": 0.01654052734375, "learning_rate": 7.552049481475674e-07, "loss": 0.0976, "num_tokens": 1125953708.0, "reward": 1.3856027126312256, "reward_std": 0.27327868342399597, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.19285248219966888, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1063.482177734375, "completions/mean_terminated_length": 832.9476928710938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.414895317244686, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1302797180316065, "kl": 0.01806640625, "learning_rate": 7.54906958335257e-07, "loss": 0.0381, "num_tokens": 1126498932.0, "reward": 1.5368304252624512, "reward_std": 0.2780376672744751, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17065013945102692, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1128.3013916015625, "completions/mean_terminated_length": 857.17626953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4151084119119919, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12062219590206154, "kl": 0.017547607421875, "learning_rate": 7.546088551226294e-07, "loss": 0.0467, "num_tokens": 1127072187.0, "reward": 1.4497768878936768, "reward_std": 0.3176611661911011, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1810987889766693, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1018.7857666015625, "completions/mean_terminated_length": 847.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.41532150657929784, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13871092067936677, "kl": 0.019622802734375, "learning_rate": 7.543106386746619e-07, "loss": 0.0591, "num_tokens": 1127594075.0, "reward": 1.3934152126312256, "reward_std": 0.27748817205429077, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.16975806653499603, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1171.1875, "completions/mean_terminated_length": 941.4873046875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4155346012466038, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2833425937338556, "kl": 0.0170440673828125, "learning_rate": 7.540123091563947e-07, "loss": 0.0874, "num_tokens": 1128193791.0, "reward": 1.4375001192092896, "reward_std": 0.31718990206718445, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.1964321881532669, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1016.700927734375, "completions/mean_terminated_length": 806.00537109375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.41574769591390975, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12241231150081495, "kl": 0.021484375, "learning_rate": 7.537138667329302e-07, "loss": 0.0955, "num_tokens": 1128715785.0, "reward": 1.5742188692092896, "reward_std": 0.2853057384490967, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15551921725273132, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1203.185302734375, "completions/mean_terminated_length": 921.5803833007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4159607905812157, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10668216636883829, "kl": 0.016510009765625, "learning_rate": 7.534153115694332e-07, "loss": 0.0617, "num_tokens": 1129334956.0, "reward": 1.3666294813156128, "reward_std": 0.31266745924949646, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19121423363685608, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1099.321533203125, "completions/mean_terminated_length": 908.5684204101562, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.41617388524852167, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11651819198423108, "kl": 0.0162353515625, "learning_rate": 7.531166438311314e-07, "loss": 0.0126, "num_tokens": 1129894780.0, "reward": 1.4492188692092896, "reward_std": 0.3175099790096283, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.1619679182767868, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1059.227783203125, "completions/mean_terminated_length": 831.0494995117188, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4163869799158276, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1270686075033123, "kl": 0.018096923828125, "learning_rate": 7.528178636833145e-07, "loss": 0.0965, "num_tokens": 1130441410.0, "reward": 1.4196429252624512, "reward_std": 0.2730393707752228, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.1719365119934082, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1023.88623046875, "completions/mean_terminated_length": 791.0054931640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4166000745831336, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 41.52921408668382, "kl": 3.1875, "learning_rate": 7.525189712913346e-07, "loss": 0.2041, "num_tokens": 1130974031.0, "reward": 1.3883929252624512, "reward_std": 0.31834521889686584, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9129464030265808, "rewards/tag_count_reward/std": 0.24453723430633545, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1060.21435546875, "completions/mean_terminated_length": 835.5945434570312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4168131692504395, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.47089650360723145, "kl": 0.018890380859375, "learning_rate": 7.522199668206056e-07, "loss": 0.1222, "num_tokens": 1131527151.0, "reward": 1.5217634439468384, "reward_std": 0.2847650945186615, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16055120527744293, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 945.4241333007812, "completions/mean_terminated_length": 794.3096313476562, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.41702626391774544, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13266296706410222, "kl": 0.021728515625, "learning_rate": 7.519208504366035e-07, "loss": 0.0605, "num_tokens": 1132018781.0, "reward": 1.5658482313156128, "reward_std": 0.29443827271461487, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.14178510010242462, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1103.0848388671875, "completions/mean_terminated_length": 831.5574951171875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.4172393585850514, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1234201463301303, "kl": 0.01806640625, "learning_rate": 7.516216223048663e-07, "loss": 0.0782, "num_tokens": 1132581107.0, "reward": 1.4497768878936768, "reward_std": 0.3119356334209442, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1740114688873291, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1184.625, "completions/mean_terminated_length": 917.0292358398438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.41745245325235736, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1325135093578888, "kl": 0.020782470703125, "learning_rate": 7.513222825909942e-07, "loss": 0.0257, "num_tokens": 1133190603.0, "reward": 1.32421875, "reward_std": 0.31454217433929443, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2270708829164505, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 982.0960083007812, "completions/mean_terminated_length": 777.9866943359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4176655479196633, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13794098097949525, "kl": 0.020111083984375, "learning_rate": 7.510228314606484e-07, "loss": 0.0564, "num_tokens": 1133702182.0, "reward": 1.4252232313156128, "reward_std": 0.3640156090259552, "rewards/accuracy_reward/mean": 0.49537035822868347, "rewards/accuracy_reward/std": 0.5005581974983215, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18416117131710052, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1165.388427734375, "completions/mean_terminated_length": 891.8304443359375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4178786425869693, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12164675538176777, "kl": 0.017425537109375, "learning_rate": 7.507232690795525e-07, "loss": 0.1106, "num_tokens": 1134294964.0, "reward": 1.4202009439468384, "reward_std": 0.38431569933891296, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18972641229629517, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 910.8616333007812, "completions/mean_terminated_length": 748.4132690429688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.41809173725427523, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13209947562323152, "kl": 0.021209716796875, "learning_rate": 7.504235956134911e-07, "loss": 0.0757, "num_tokens": 1134771094.0, "reward": 1.5803571939468384, "reward_std": 0.3169344961643219, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18066051602363586, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1074.1585693359375, "completions/mean_terminated_length": 878.3458862304688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4183048319215812, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13444507963379274, "kl": 0.01910400390625, "learning_rate": 7.501238112283109e-07, "loss": 0.141, "num_tokens": 1135320349.0, "reward": 1.5658482313156128, "reward_std": 0.3748548924922943, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.17649045586585999, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1048.930908203125, "completions/mean_terminated_length": 838.3162231445312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.4185179265888871, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11863590802955154, "kl": 0.019378662109375, "learning_rate": 7.49823916089919e-07, "loss": 0.109, "num_tokens": 1135859838.0, "reward": 1.657366156578064, "reward_std": 0.35045763850212097, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45011183619499207, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19175945222377777, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1060.5692138671875, "completions/mean_terminated_length": 855.6307373046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.41873102125619305, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12687230708668706, "kl": 0.01910400390625, "learning_rate": 7.495239103642849e-07, "loss": 0.0712, "num_tokens": 1136398029.0, "reward": 1.4720982313156128, "reward_std": 0.35651785135269165, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17213605344295502, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1001.3460083007812, "completions/mean_terminated_length": 854.86767578125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.418944115923499, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13707076088885917, "kl": 0.018890380859375, "learning_rate": 7.492237942174387e-07, "loss": 0.068, "num_tokens": 1136917496.0, "reward": 1.5039063692092896, "reward_std": 0.3045034408569336, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17107665538787842, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1026.74560546875, "completions/mean_terminated_length": 811.4541015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.41915721059080496, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11670777666372437, "kl": 0.02032470703125, "learning_rate": 7.489235678154718e-07, "loss": 0.0766, "num_tokens": 1137447798.0, "reward": 1.5379464626312256, "reward_std": 0.2798476219177246, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17063917219638824, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1133.8929443359375, "completions/mean_terminated_length": 850.5731201171875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4193703052581109, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13003822700247553, "kl": 0.0196533203125, "learning_rate": 7.486232313245362e-07, "loss": 0.0691, "num_tokens": 1138022422.0, "reward": 1.4525669813156128, "reward_std": 0.31875282526016235, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9146205186843872, "rewards/tag_count_reward/std": 0.23133563995361328, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1124.5982666015625, "completions/mean_terminated_length": 911.5054931640625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4195833999254169, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12400522762694725, "kl": 0.017333984375, "learning_rate": 7.483227849108455e-07, "loss": 0.0996, "num_tokens": 1138591650.0, "reward": 1.4497768878936768, "reward_std": 0.363167405128479, "rewards/accuracy_reward/mean": 0.5162037014961243, "rewards/accuracy_reward/std": 0.5003167986869812, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17032796144485474, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1005.4420166015625, "completions/mean_terminated_length": 825.3141479492188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.41979649459272284, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12795603506722777, "kl": 0.02056884765625, "learning_rate": 7.480222287406737e-07, "loss": 0.0584, "num_tokens": 1139115816.0, "reward": 1.497209906578064, "reward_std": 0.2912636399269104, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15101487934589386, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1050.7545166015625, "completions/mean_terminated_length": 875.3858032226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4200095892600288, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11754554426197114, "kl": 0.018280029296875, "learning_rate": 7.477215629803555e-07, "loss": 0.0588, "num_tokens": 1139662618.0, "reward": 1.5463169813156128, "reward_std": 0.3298322856426239, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.15702906250953674, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 996.66748046875, "completions/mean_terminated_length": 855.6025390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4202226839273347, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11588709542951056, "kl": 0.01788330078125, "learning_rate": 7.474207877962866e-07, "loss": 0.0479, "num_tokens": 1140177109.0, "reward": 1.4648438692092896, "reward_std": 0.28817933797836304, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14392071962356567, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1120.765625, "completions/mean_terminated_length": 884.4118041992188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.42043577859464065, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11929480171024315, "kl": 0.0176239013671875, "learning_rate": 7.471199033549228e-07, "loss": 0.0589, "num_tokens": 1140747244.0, "reward": 1.4458706378936768, "reward_std": 0.3368956446647644, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17670516669750214, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1132.2366943359375, "completions/mean_terminated_length": 895.5786743164062, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.4206488732619466, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11648480572944128, "kl": 0.018585205078125, "learning_rate": 7.468189098227809e-07, "loss": 0.0627, "num_tokens": 1141320438.0, "reward": 1.4977679252624512, "reward_std": 0.33273303508758545, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.17423014342784882, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1084.4241943359375, "completions/mean_terminated_length": 811.0888671875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.42086196792925257, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1190998217571097, "kl": 0.017852783203125, "learning_rate": 7.465178073664373e-07, "loss": 0.1049, "num_tokens": 1141872100.0, "reward": 1.5039063692092896, "reward_std": 0.3167956471443176, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16314294934272766, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1058.57373046875, "completions/mean_terminated_length": 820.1246337890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4210750625965585, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12479343437162574, "kl": 0.0181884765625, "learning_rate": 7.462165961525298e-07, "loss": 0.0876, "num_tokens": 1142413205.0, "reward": 1.532366156578064, "reward_std": 0.2935563623905182, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.19385728240013123, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1071.1317138671875, "completions/mean_terminated_length": 835.7091064453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4212881572638645, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1327260926365753, "kl": 0.019073486328125, "learning_rate": 7.459152763477552e-07, "loss": 0.0928, "num_tokens": 1142966272.0, "reward": 1.450334906578064, "reward_std": 0.32430052757263184, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18636047840118408, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 983.6585083007812, "completions/mean_terminated_length": 803.026123046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.42150125193117044, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11774500868802211, "kl": 0.02117919921875, "learning_rate": 7.456138481188713e-07, "loss": 0.043, "num_tokens": 1143480343.0, "reward": 1.5351563692092896, "reward_std": 0.2810564637184143, "rewards/accuracy_reward/mean": 0.6064814925193787, "rewards/accuracy_reward/std": 0.4890965521335602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17189201712608337, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 957.6652221679688, "completions/mean_terminated_length": 850.7696533203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4217143465984764, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13647354888474048, "kl": 0.020111083984375, "learning_rate": 7.453123116326955e-07, "loss": 0.0658, "num_tokens": 1143986529.0, "reward": 1.5446429252624512, "reward_std": 0.26959431171417236, "rewards/accuracy_reward/mean": 0.6064814925193787, "rewards/accuracy_reward/std": 0.4890965521335602, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15285643935203552, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1085.3951416015625, "completions/mean_terminated_length": 866.5014038085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4219274412657823, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12794814428472606, "kl": 0.0171966552734375, "learning_rate": 7.450106670561049e-07, "loss": 0.1046, "num_tokens": 1144553058.0, "reward": 1.3621652126312256, "reward_std": 0.35093072056770325, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.18826662003993988, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 951.3973388671875, "completions/mean_terminated_length": 768.6302490234375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.42214053593308826, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1332421447932563, "kl": 0.021636962890625, "learning_rate": 7.44708914556037e-07, "loss": 0.1182, "num_tokens": 1145049684.0, "reward": 1.5418527126312256, "reward_std": 0.3753419816493988, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19447050988674164, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1060.4888916015625, "completions/mean_terminated_length": 889.8717651367188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4223536306003942, "frac_reward_zero_std": 0.0, "grad_norm": 0.12549930456168193, "kl": 0.019744873046875, "learning_rate": 7.444070542994886e-07, "loss": 0.0739, "num_tokens": 1145597439.0, "reward": 1.5747768878936768, "reward_std": 0.35021594166755676, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.15552422404289246, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1005.0803833007812, "completions/mean_terminated_length": 821.6798095703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.42256672526770017, "frac_reward_zero_std": 0.0, "grad_norm": 0.1237000475195039, "kl": 0.018402099609375, "learning_rate": 7.441050864535161e-07, "loss": 0.051, "num_tokens": 1146110611.0, "reward": 1.4453126192092896, "reward_std": 0.34513217210769653, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1748131364583969, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 991.560302734375, "completions/mean_terminated_length": 815.4869995117188, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.42277981993500613, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11668856977376964, "kl": 0.020263671875, "learning_rate": 7.438030111852359e-07, "loss": 0.1034, "num_tokens": 1146622206.0, "reward": 1.5318081378936768, "reward_std": 0.31090766191482544, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1423770934343338, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1037.3929443359375, "completions/mean_terminated_length": 810.97265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4229929146023121, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13429306115904246, "kl": 0.017608642578125, "learning_rate": 7.435008286618234e-07, "loss": 0.1738, "num_tokens": 1147164030.0, "reward": 1.520647406578064, "reward_std": 0.3790675103664398, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18750248849391937, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 943.8683471679688, "completions/mean_terminated_length": 756.4830322265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.42320600926961804, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13749421301061082, "kl": 0.020660400390625, "learning_rate": 7.431985390505134e-07, "loss": 0.055, "num_tokens": 1147653939.0, "reward": 1.4397321939468384, "reward_std": 0.23235628008842468, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.13773420453071594, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1083.0045166015625, "completions/mean_terminated_length": 873.2228393554688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.423419103936924, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11362423124964294, "kl": 0.0174560546875, "learning_rate": 7.428961425186002e-07, "loss": 0.0607, "num_tokens": 1148203141.0, "reward": 1.5239956378936768, "reward_std": 0.29532772302627563, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.14691685140132904, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 990.4777221679688, "completions/mean_terminated_length": 854.6246337890625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4236321986042299, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11907821825803847, "kl": 0.01910400390625, "learning_rate": 7.425936392334368e-07, "loss": 0.0932, "num_tokens": 1148715003.0, "reward": 1.6668527126312256, "reward_std": 0.31848883628845215, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45739173889160156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14266617596149445, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1161.6451416015625, "completions/mean_terminated_length": 926.2853393554688, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.42384529327153586, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.9161053668214424, "kl": 0.032989501953125, "learning_rate": 7.42291029362436e-07, "loss": 0.059, "num_tokens": 1149301356.0, "reward": 1.3939732313156128, "reward_std": 0.37136876583099365, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17708362638950348, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1083.154052734375, "completions/mean_terminated_length": 798.7196655273438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4240583879388418, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12687526118672238, "kl": 0.018646240234375, "learning_rate": 7.419883130730691e-07, "loss": 0.0219, "num_tokens": 1149857345.0, "reward": 1.387834906578064, "reward_std": 0.28479573130607605, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19084173440933228, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1039.2835693359375, "completions/mean_terminated_length": 826.6351318359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4242714826061478, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13878997570498752, "kl": 0.020416259765625, "learning_rate": 7.416854905328664e-07, "loss": 0.087, "num_tokens": 1150388160.0, "reward": 1.4168527126312256, "reward_std": 0.3089248239994049, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.19362127780914307, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1129.7076416015625, "completions/mean_terminated_length": 852.0842895507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.42448457727345373, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11262305235215861, "kl": 0.0171356201171875, "learning_rate": 7.41382561909417e-07, "loss": 0.0245, "num_tokens": 1150957037.0, "reward": 1.4352679252624512, "reward_std": 0.3097078502178192, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1884397715330124, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1129.60498046875, "completions/mean_terminated_length": 923.84423828125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.4246976719407597, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12579420887970502, "kl": 0.01806640625, "learning_rate": 7.410795273703685e-07, "loss": 0.0862, "num_tokens": 1151539900.0, "reward": 1.3694196939468384, "reward_std": 0.3807280659675598, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.2142428457736969, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 987.8973388671875, "completions/mean_terminated_length": 833.35546875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.42491076660806565, "frac_reward_zero_std": 0.0, "grad_norm": 0.12536190974543424, "kl": 0.02001953125, "learning_rate": 7.407763870834275e-07, "loss": 0.1066, "num_tokens": 1152047118.0, "reward": 1.618303656578064, "reward_std": 0.36230963468551636, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1615753173828125, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 916.5558471679688, "completions/mean_terminated_length": 748.289794921875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4251238612753716, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14486461726569597, "kl": 0.019439697265625, "learning_rate": 7.40473141216359e-07, "loss": 0.1041, "num_tokens": 1152526279.0, "reward": 1.481584906578064, "reward_std": 0.3407151401042938, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17107665538787842, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1089.665283203125, "completions/mean_terminated_length": 852.0835571289062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4253369559426775, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1179002351996685, "kl": 0.018524169921875, "learning_rate": 7.401697899369863e-07, "loss": 0.0471, "num_tokens": 1153082705.0, "reward": 1.4263393878936768, "reward_std": 0.2941608428955078, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.17920349538326263, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1072.46875, "completions/mean_terminated_length": 885.6648559570312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.42555005060998347, "frac_reward_zero_std": 0.0, "grad_norm": 0.12089811651514756, "kl": 0.018157958984375, "learning_rate": 7.398663334131913e-07, "loss": 0.0221, "num_tokens": 1153643779.0, "reward": 1.4955357313156128, "reward_std": 0.3620457351207733, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1853403002023697, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1070.0960693359375, "completions/mean_terminated_length": 799.8489990234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.4257631452772894, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12848578540188574, "kl": 0.020416259765625, "learning_rate": 7.395627718129136e-07, "loss": 0.0874, "num_tokens": 1154191758.0, "reward": 1.4045759439468384, "reward_std": 0.33130455017089844, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15974830090999603, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 955.8638916015625, "completions/mean_terminated_length": 757.0316772460938, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4259762399445954, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13618537501844835, "kl": 0.02056884765625, "learning_rate": 7.392591053041516e-07, "loss": 0.104, "num_tokens": 1154686001.0, "reward": 1.532366156578064, "reward_std": 0.3016572892665863, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.16701211035251617, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1081.5357666015625, "completions/mean_terminated_length": 824.9039306640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.42618933461190134, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1258082510947054, "kl": 0.01617431640625, "learning_rate": 7.389553340549612e-07, "loss": 0.0431, "num_tokens": 1155245553.0, "reward": 1.4079241752624512, "reward_std": 0.2892843186855316, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1900748312473297, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1055.4442138671875, "completions/mean_terminated_length": 849.4420166015625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4264024292792073, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12357926587036605, "kl": 0.01995849609375, "learning_rate": 7.386514582334569e-07, "loss": 0.0335, "num_tokens": 1155793176.0, "reward": 1.4709821939468384, "reward_std": 0.3795382082462311, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19061346352100372, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1066.2098388671875, "completions/mean_terminated_length": 812.48876953125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.42661552394651325, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12394146871752577, "kl": 0.019683837890625, "learning_rate": 7.383474780078104e-07, "loss": 0.0991, "num_tokens": 1156339782.0, "reward": 1.4988839626312256, "reward_std": 0.2770690321922302, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.194654181599617, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1065.4129638671875, "completions/mean_terminated_length": 877.2579345703125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4268286186138192, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12136924180244796, "kl": 0.018798828125, "learning_rate": 7.380433935462517e-07, "loss": 0.0727, "num_tokens": 1156892271.0, "reward": 1.4603794813156128, "reward_std": 0.30810728669166565, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18297357857227325, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 940.41748046875, "completions/mean_terminated_length": 749.0549926757812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4270417132811251, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1378721192781175, "kl": 0.021697998046875, "learning_rate": 7.377392050170679e-07, "loss": 0.125, "num_tokens": 1157385594.0, "reward": 1.4860491752624512, "reward_std": 0.32827243208885193, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19084827601909637, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1046.669677734375, "completions/mean_terminated_length": 822.3278198242188, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.42725480794843107, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11664888552883391, "kl": 0.020172119140625, "learning_rate": 7.374349125886046e-07, "loss": 0.0578, "num_tokens": 1157920134.0, "reward": 1.3532366752624512, "reward_std": 0.3162634074687958, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19194407761096954, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1125.2098388671875, "completions/mean_terminated_length": 870.1937255859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.427467902615737, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11551836673067618, "kl": 0.0179443359375, "learning_rate": 7.37130516429264e-07, "loss": 0.0957, "num_tokens": 1158495684.0, "reward": 1.4168527126312256, "reward_std": 0.3748857080936432, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20008359849452972, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1173.5379638671875, "completions/mean_terminated_length": 941.336181640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.427680997283043, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10907505663280373, "kl": 0.01556396484375, "learning_rate": 7.368260167075061e-07, "loss": 0.0689, "num_tokens": 1159089429.0, "reward": 1.4090402126312256, "reward_std": 0.36002999544143677, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.18208445608615875, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1072.796875, "completions/mean_terminated_length": 870.3961791992188, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.42789409195034894, "frac_reward_zero_std": 0.0, "grad_norm": 0.12994434872577318, "kl": 0.0189208984375, "learning_rate": 7.365214135918485e-07, "loss": 0.122, "num_tokens": 1159641050.0, "reward": 1.4391741752624512, "reward_std": 0.41247203946113586, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.1966029405593872, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1002.7433471679688, "completions/mean_terminated_length": 812.4459228515625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.4281071866176549, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13513243794580246, "kl": 0.020660400390625, "learning_rate": 7.362167072508652e-07, "loss": 0.0955, "num_tokens": 1160163303.0, "reward": 1.4235491752624512, "reward_std": 0.3787241578102112, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9079241156578064, "rewards/tag_count_reward/std": 0.2347799688577652, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 975.7210083007812, "completions/mean_terminated_length": 803.4896240234375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.42832028128496086, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13040082486508398, "kl": 0.020111083984375, "learning_rate": 7.359118978531883e-07, "loss": 0.0823, "num_tokens": 1160662954.0, "reward": 1.5731027126312256, "reward_std": 0.2644284963607788, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14470793306827545, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1021.2723388671875, "completions/mean_terminated_length": 811.5107421875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4285333759522668, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12310608045039467, "kl": 0.018951416015625, "learning_rate": 7.356069855675061e-07, "loss": 0.0584, "num_tokens": 1161185492.0, "reward": 1.481584906578064, "reward_std": 0.3446776270866394, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1390950083732605, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1105.2388916015625, "completions/mean_terminated_length": 854.901123046875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4287464706195728, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11552040829679966, "kl": 0.016998291015625, "learning_rate": 7.353019705625645e-07, "loss": 0.0958, "num_tokens": 1161753599.0, "reward": 1.4129464626312256, "reward_std": 0.34539273381233215, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.20407520234584808, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1007.8348388671875, "completions/mean_terminated_length": 785.1436767578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4289595652868787, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12037030021667264, "kl": 0.017822265625, "learning_rate": 7.349968530071658e-07, "loss": 0.073, "num_tokens": 1162271909.0, "reward": 1.5195313692092896, "reward_std": 0.2649814486503601, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.15725943446159363, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1021.3460083007812, "completions/mean_terminated_length": 814.9142456054688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.42917265995418463, "frac_reward_zero_std": 0.0, "grad_norm": 0.1220540121741869, "kl": 0.017608642578125, "learning_rate": 7.346916330701693e-07, "loss": 0.0484, "num_tokens": 1162808448.0, "reward": 1.4414063692092896, "reward_std": 0.37261244654655457, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19942738115787506, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1046.88623046875, "completions/mean_terminated_length": 842.3575439453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4293857546214906, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12212174750647749, "kl": 0.01708984375, "learning_rate": 7.343863109204909e-07, "loss": 0.0924, "num_tokens": 1163341693.0, "reward": 1.3945313692092896, "reward_std": 0.302133709192276, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17378443479537964, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1000.4777221679688, "completions/mean_terminated_length": 799.8882446289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.42959884928879655, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1277072093180643, "kl": 0.01904296875, "learning_rate": 7.34080886727103e-07, "loss": 0.0691, "num_tokens": 1163856915.0, "reward": 1.5680804252624512, "reward_std": 0.2932305932044983, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1540725976228714, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1050.8773193359375, "completions/mean_terminated_length": 850.3834228515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4298119439561025, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12653991887985958, "kl": 0.018463134765625, "learning_rate": 7.337753606590344e-07, "loss": 0.1054, "num_tokens": 1164400988.0, "reward": 1.4748884439468384, "reward_std": 0.295178085565567, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17706511914730072, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 985.47998046875, "completions/mean_terminated_length": 845.9570922851562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.43002503862340846, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11480390835647009, "kl": 0.019256591796875, "learning_rate": 7.334697328853706e-07, "loss": 0.0401, "num_tokens": 1164909043.0, "reward": 1.4916294813156128, "reward_std": 0.32447755336761475, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14720547199249268, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 977.4107666015625, "completions/mean_terminated_length": 815.0333862304688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4302381332907144, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13233413273450828, "kl": 0.018798828125, "learning_rate": 7.331640035752528e-07, "loss": 0.0642, "num_tokens": 1165410907.0, "reward": 1.5005581378936768, "reward_std": 0.3394905626773834, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.17984239757061005, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1026.341552734375, "completions/mean_terminated_length": 852.9530029296875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.4304512279580204, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12585713938985338, "kl": 0.0206298828125, "learning_rate": 7.328581728978792e-07, "loss": 0.0762, "num_tokens": 1165940420.0, "reward": 1.5106027126312256, "reward_std": 0.3467922508716583, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.17637281119823456, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1086.71875, "completions/mean_terminated_length": 788.7777709960938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.4306643226253263, "frac_reward_zero_std": 0.0, "grad_norm": 0.127654545678264, "kl": 0.017822265625, "learning_rate": 7.325522410225035e-07, "loss": 0.0884, "num_tokens": 1166499094.0, "reward": 1.4882813692092896, "reward_std": 0.38874325156211853, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.20279188454151154, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1050.25, "completions/mean_terminated_length": 871.705322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.43087741729263224, "frac_reward_zero_std": 0.0, "grad_norm": 0.12945923712176072, "kl": 0.021331787109375, "learning_rate": 7.322462081184355e-07, "loss": 0.0495, "num_tokens": 1167042374.0, "reward": 1.5295759439468384, "reward_std": 0.3477681577205658, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.1743152141571045, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 996.950927734375, "completions/mean_terminated_length": 828.1295166015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4310905119599382, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1600873446032268, "kl": 0.020782470703125, "learning_rate": 7.319400743550411e-07, "loss": 0.0755, "num_tokens": 1167562304.0, "reward": 1.4414063692092896, "reward_std": 0.30521509051322937, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.20611964166164398, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1025.805908203125, "completions/mean_terminated_length": 806.9620971679688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.43130360662724415, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14637321611767606, "kl": 0.02008056640625, "learning_rate": 7.316338399017419e-07, "loss": 0.071, "num_tokens": 1168104873.0, "reward": 1.4910714626312256, "reward_std": 0.25788840651512146, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.1845843642950058, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1034.263427734375, "completions/mean_terminated_length": 833.6845092773438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4315167012945501, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12195401875072376, "kl": 0.018585205078125, "learning_rate": 7.313275049280152e-07, "loss": 0.1058, "num_tokens": 1168633519.0, "reward": 1.528459906578064, "reward_std": 0.32592836022377014, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18088066577911377, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 987.1563110351562, "completions/mean_terminated_length": 797.3211059570312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.43172979596185607, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13299690038928727, "kl": 0.02044677734375, "learning_rate": 7.310210696033939e-07, "loss": 0.0589, "num_tokens": 1169144405.0, "reward": 1.4235491752624512, "reward_std": 0.29363900423049927, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18336887657642365, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1074.5692138671875, "completions/mean_terminated_length": 888.1675415039062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.431942890629162, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12506611955202673, "kl": 0.02008056640625, "learning_rate": 7.307145340974666e-07, "loss": 0.0783, "num_tokens": 1169698132.0, "reward": 1.4843751192092896, "reward_std": 0.34263327717781067, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.16360238194465637, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 974.5491333007812, "completions/mean_terminated_length": 782.4579467773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.432155985296468, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1376999212339923, "kl": 0.02020263671875, "learning_rate": 7.304078985798773e-07, "loss": 0.0921, "num_tokens": 1170198106.0, "reward": 1.4592634439468384, "reward_std": 0.34764721989631653, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.1878284513950348, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1182.671875, "completions/mean_terminated_length": 858.83740234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4323690799637739, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12228657128802851, "kl": 0.018035888671875, "learning_rate": 7.30101163220325e-07, "loss": 0.0664, "num_tokens": 1170800055.0, "reward": 1.328125, "reward_std": 0.3932945430278778, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.24831648170948029, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1031.10498046875, "completions/mean_terminated_length": 849.13427734375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.43258217463107984, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1395244118199411, "kl": 0.021942138671875, "learning_rate": 7.297943281885644e-07, "loss": 0.0862, "num_tokens": 1171322118.0, "reward": 1.4397321939468384, "reward_std": 0.25645577907562256, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19568081200122833, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1008.1495971679688, "completions/mean_terminated_length": 809.0292358398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4327952692983858, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13529758386248378, "kl": 0.020904541015625, "learning_rate": 7.294873936544054e-07, "loss": 0.0859, "num_tokens": 1171848009.0, "reward": 1.547991156578064, "reward_std": 0.2893456518650055, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.4822678565979004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2177339345216751, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1153.493408203125, "completions/mean_terminated_length": 889.7947998046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.43300836396569176, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.2427688975096126, "kl": 0.04595947265625, "learning_rate": 7.291803597877126e-07, "loss": 0.1224, "num_tokens": 1172440262.0, "reward": 1.3459821939468384, "reward_std": 0.34488779306411743, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9040178656578064, "rewards/tag_count_reward/std": 0.23049293458461761, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1082.28125, "completions/mean_terminated_length": 797.589599609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4332214586329977, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13455096043180642, "kl": 0.018768310546875, "learning_rate": 7.288732267584058e-07, "loss": 0.12, "num_tokens": 1172989508.0, "reward": 1.4983259439468384, "reward_std": 0.3553817570209503, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9112723469734192, "rewards/tag_count_reward/std": 0.22639364004135132, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1058.5223388671875, "completions/mean_terminated_length": 859.5657348632812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.43343455330030367, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.4201079099197848, "kl": 0.04888916015625, "learning_rate": 7.285659947364592e-07, "loss": 0.0563, "num_tokens": 1173540046.0, "reward": 1.4916294813156128, "reward_std": 0.34383928775787354, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.20298877358436584, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 984.575927734375, "completions/mean_terminated_length": 813.766845703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.43364764796760963, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14342454283568976, "kl": 0.020294189453125, "learning_rate": 7.28258663891903e-07, "loss": 0.0836, "num_tokens": 1174047920.0, "reward": 1.4642857313156128, "reward_std": 0.3623078763484955, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18918029963970184, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1103.93310546875, "completions/mean_terminated_length": 879.6519775390625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4338607426349156, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12177834845138455, "kl": 0.016693115234375, "learning_rate": 7.279512343948207e-07, "loss": 0.0591, "num_tokens": 1174617410.0, "reward": 1.4838169813156128, "reward_std": 0.3620666265487671, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.19157950580120087, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1016.2745971679688, "completions/mean_terminated_length": 821.9708251953125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4340738373022215, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12778408639132097, "kl": 0.020233154296875, "learning_rate": 7.276437064153513e-07, "loss": 0.0931, "num_tokens": 1175146653.0, "reward": 1.5842634439468384, "reward_std": 0.3446699380874634, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.1929689645767212, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1051.3348388671875, "completions/mean_terminated_length": 831.3623657226562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.43428693196952745, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1240360589442704, "kl": 0.019622802734375, "learning_rate": 7.273360801236876e-07, "loss": 0.1176, "num_tokens": 1175677027.0, "reward": 1.5641741752624512, "reward_std": 0.3642991781234741, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21464671194553375, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1067.19873046875, "completions/mean_terminated_length": 824.04736328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4345000266368334, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1220292390746561, "kl": 0.017547607421875, "learning_rate": 7.270283556900776e-07, "loss": 0.0788, "num_tokens": 1176215644.0, "reward": 1.3783482313156128, "reward_std": 0.3106710612773895, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18725347518920898, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 961.9308471679688, "completions/mean_terminated_length": 840.6575317382812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.43471312130413936, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12516932710271833, "kl": 0.02056884765625, "learning_rate": 7.267205332848231e-07, "loss": 0.0519, "num_tokens": 1176711229.0, "reward": 1.6467634439468384, "reward_std": 0.3304532766342163, "rewards/accuracy_reward/mean": 0.6919642686843872, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.1490427851676941, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1014.2522583007812, "completions/mean_terminated_length": 819.567626953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4349262159714453, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12650198826089648, "kl": 0.01873779296875, "learning_rate": 7.264126130782803e-07, "loss": 0.0512, "num_tokens": 1177228654.0, "reward": 1.4827009439468384, "reward_std": 0.25383102893829346, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15974830090999603, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1123.8638916015625, "completions/mean_terminated_length": 865.105712890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4351393106387513, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12545230015232006, "kl": 0.01751708984375, "learning_rate": 7.261045952408593e-07, "loss": 0.0794, "num_tokens": 1177798209.0, "reward": 1.4056919813156128, "reward_std": 0.3215946555137634, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18260477483272552, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1020.07373046875, "completions/mean_terminated_length": 823.2366943359375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.43535240530605723, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13670143417797545, "kl": 0.018951416015625, "learning_rate": 7.257964799430245e-07, "loss": 0.0433, "num_tokens": 1178327890.0, "reward": 1.3950893878936768, "reward_std": 0.2512245178222656, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1724584698677063, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 997.4933471679688, "completions/mean_terminated_length": 789.6390380859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4355654999733632, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12882352029113336, "kl": 0.020172119140625, "learning_rate": 7.254882673552942e-07, "loss": 0.0843, "num_tokens": 1178839279.0, "reward": 1.5273438692092896, "reward_std": 0.2648398280143738, "rewards/accuracy_reward/mean": 0.6180555820465088, "rewards/accuracy_reward/std": 0.48642635345458984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.19038327038288116, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1101.024658203125, "completions/mean_terminated_length": 907.5564575195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4357785946406691, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13585145716587707, "kl": 0.019927978515625, "learning_rate": 7.251799576482403e-07, "loss": 0.0966, "num_tokens": 1179411290.0, "reward": 1.4162946939468384, "reward_std": 0.38530245423316956, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9207589030265808, "rewards/tag_count_reward/std": 0.2189916968345642, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 932.2857666015625, "completions/mean_terminated_length": 739.518310546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.43599168930797505, "frac_reward_zero_std": 0.0, "grad_norm": 0.1301197931461155, "kl": 0.021453857421875, "learning_rate": 7.248715509924888e-07, "loss": 0.0364, "num_tokens": 1179891946.0, "reward": 1.481584906578064, "reward_std": 0.2887004613876343, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.14997799694538116, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1151.149658203125, "completions/mean_terminated_length": 880.0087280273438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.436204783975281, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12083018928102943, "kl": 0.0165557861328125, "learning_rate": 7.24563047558719e-07, "loss": 0.1142, "num_tokens": 1180480109.0, "reward": 1.4252232313156128, "reward_std": 0.3353671431541443, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.2101718634366989, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1073.8460693359375, "completions/mean_terminated_length": 849.041259765625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.43641787864258696, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1186508653277658, "kl": 0.0177001953125, "learning_rate": 7.242544475176642e-07, "loss": 0.0815, "num_tokens": 1181025640.0, "reward": 1.5195313692092896, "reward_std": 0.2967233657836914, "rewards/accuracy_reward/mean": 0.5856481194496155, "rewards/accuracy_reward/std": 0.49318093061447144, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16569413244724274, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1011.0245971679688, "completions/mean_terminated_length": 831.8612670898438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4366309733098929, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1434211598673814, "kl": 0.018829345703125, "learning_rate": 7.239457510401106e-07, "loss": 0.1329, "num_tokens": 1181549443.0, "reward": 1.4715402126312256, "reward_std": 0.33091679215431213, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.19411711394786835, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 859.1295166015625, "completions/mean_terminated_length": 729.6484985351562, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4368440679771989, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1208979635298343, "kl": 0.0218505859375, "learning_rate": 7.236369582968981e-07, "loss": 0.071, "num_tokens": 1181994749.0, "reward": 1.6668527126312256, "reward_std": 0.2530975043773651, "rewards/accuracy_reward/mean": 0.7361111044883728, "rewards/accuracy_reward/std": 0.4412507712841034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15224164724349976, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1162.71435546875, "completions/mean_terminated_length": 874.6035766601562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.43705716264450484, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1178898340547743, "kl": 0.0163116455078125, "learning_rate": 7.233280694589202e-07, "loss": 0.0827, "num_tokens": 1182589277.0, "reward": 1.4525669813156128, "reward_std": 0.33481213450431824, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19445767998695374, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1042.4085693359375, "completions/mean_terminated_length": 806.9393920898438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4372702573118108, "frac_reward_zero_std": 0.0, "grad_norm": 0.13246008378885643, "kl": 0.01922607421875, "learning_rate": 7.230190846971229e-07, "loss": 0.0997, "num_tokens": 1183123860.0, "reward": 1.4319196939468384, "reward_std": 0.378562867641449, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18263639509677887, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1119.8326416015625, "completions/mean_terminated_length": 866.696044921875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4374833519791167, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13260994669542384, "kl": 0.017608642578125, "learning_rate": 7.227100041825057e-07, "loss": 0.0733, "num_tokens": 1183702585.0, "reward": 1.3856027126312256, "reward_std": 0.2696685194969177, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.19084173440933228, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1015.7879638671875, "completions/mean_terminated_length": 818.1303100585938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.43769644664642265, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1337741084175181, "kl": 0.0203857421875, "learning_rate": 7.22400828086121e-07, "loss": 0.0901, "num_tokens": 1184224266.0, "reward": 1.4988839626312256, "reward_std": 0.3660476505756378, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9386160969734192, "rewards/tag_count_reward/std": 0.18955937027931213, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1102.04248046875, "completions/mean_terminated_length": 857.5814819335938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4379095413137286, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12222667940818496, "kl": 0.01629638671875, "learning_rate": 7.220915565790742e-07, "loss": 0.0558, "num_tokens": 1184787341.0, "reward": 1.5195313692092896, "reward_std": 0.3279862403869629, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.13563236594200134, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 925.1652221679688, "completions/mean_terminated_length": 799.78662109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.43812263598103457, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14424101838351153, "kl": 0.021728515625, "learning_rate": 7.217821898325234e-07, "loss": 0.0903, "num_tokens": 1185267255.0, "reward": 1.5457589626312256, "reward_std": 0.2965417802333832, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.16277234256267548, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 949.169677734375, "completions/mean_terminated_length": 808.0100708007812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4383357306483405, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13055713084509007, "kl": 0.020538330078125, "learning_rate": 7.21472728017679e-07, "loss": 0.0686, "num_tokens": 1185759203.0, "reward": 1.6322544813156128, "reward_std": 0.2778674066066742, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.15070870518684387, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1055.7857666015625, "completions/mean_terminated_length": 830.158935546875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4385488253156465, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11103727218472582, "kl": 0.017120361328125, "learning_rate": 7.211631713058049e-07, "loss": 0.0714, "num_tokens": 1186300243.0, "reward": 1.5145089626312256, "reward_std": 0.3523017466068268, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17146752774715424, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 898.9285888671875, "completions/mean_terminated_length": 780.05908203125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.43876191998295244, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12508904020978298, "kl": 0.022247314453125, "learning_rate": 7.20853519868217e-07, "loss": 0.0208, "num_tokens": 1186773507.0, "reward": 1.5719866752624512, "reward_std": 0.3009365499019623, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.14503538608551025, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1048.169677734375, "completions/mean_terminated_length": 856.7127685546875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4389750146502584, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1294436413958662, "kl": 0.02056884765625, "learning_rate": 7.205437738762835e-07, "loss": 0.0582, "num_tokens": 1187315263.0, "reward": 1.4575893878936768, "reward_std": 0.3215850591659546, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.1791059374809265, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1130.388427734375, "completions/mean_terminated_length": 859.8786010742188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4391881093175643, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12964819241148098, "kl": 0.0167236328125, "learning_rate": 7.202339335014253e-07, "loss": 0.074, "num_tokens": 1187893517.0, "reward": 1.4531251192092896, "reward_std": 0.3764910101890564, "rewards/accuracy_reward/mean": 0.5162037014961243, "rewards/accuracy_reward/std": 0.5003167986869812, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1762816607952118, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 946.72998046875, "completions/mean_terminated_length": 782.9512939453125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.43940120398487026, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11200487028017626, "kl": 0.0183563232421875, "learning_rate": 7.199239989151151e-07, "loss": 0.0422, "num_tokens": 1188385524.0, "reward": 1.575334906578064, "reward_std": 0.27904102206230164, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1304718255996704, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 975.0670166015625, "completions/mean_terminated_length": 809.1494750976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4396142986521762, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11965050224347294, "kl": 0.018798828125, "learning_rate": 7.196139702888781e-07, "loss": 0.0424, "num_tokens": 1188890066.0, "reward": 1.6082589626312256, "reward_std": 0.376095712184906, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15465489029884338, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 965.8504638671875, "completions/mean_terminated_length": 775.5512084960938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4398273933194822, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1319501482628102, "kl": 0.01934814453125, "learning_rate": 7.193038477942912e-07, "loss": 0.0772, "num_tokens": 1189387231.0, "reward": 1.5820313692092896, "reward_std": 0.29898250102996826, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.1453535109758377, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 996.5826416015625, "completions/mean_terminated_length": 805.1636352539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44004048798678813, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13854250302472904, "kl": 0.019134521484375, "learning_rate": 7.189936316029839e-07, "loss": 0.0832, "num_tokens": 1189901076.0, "reward": 1.532366156578064, "reward_std": 0.2984950542449951, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17314842343330383, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1034.0045166015625, "completions/mean_terminated_length": 839.8350830078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4402535826540941, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13100043858490118, "kl": 0.018463134765625, "learning_rate": 7.186833218866367e-07, "loss": 0.0813, "num_tokens": 1190434566.0, "reward": 1.5273438692092896, "reward_std": 0.3369886577129364, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.19120772182941437, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 925.1719360351562, "completions/mean_terminated_length": 784.113037109375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.44046667732140005, "frac_reward_zero_std": 0.0, "grad_norm": 0.14068999215594574, "kl": 0.019775390625, "learning_rate": 7.183729188169825e-07, "loss": 0.0508, "num_tokens": 1190917651.0, "reward": 1.5452009439468384, "reward_std": 0.3048968017101288, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.14560237526893616, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1011.5201416015625, "completions/mean_terminated_length": 857.376953125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.440679771988706, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12621636009318526, "kl": 0.01812744140625, "learning_rate": 7.180624225658057e-07, "loss": 0.0823, "num_tokens": 1191445756.0, "reward": 1.4743304252624512, "reward_std": 0.34153011441230774, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18195155262947083, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 935.3772583007812, "completions/mean_terminated_length": 792.44580078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4408928666560119, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15364702306585823, "kl": 0.02020263671875, "learning_rate": 7.17751833304942e-07, "loss": 0.0613, "num_tokens": 1191928821.0, "reward": 1.5976563692092896, "reward_std": 0.2924443781375885, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.16456767916679382, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 920.247802734375, "completions/mean_terminated_length": 797.4232788085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44110596132331786, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13565063109202755, "kl": 0.022186279296875, "learning_rate": 7.174411512062789e-07, "loss": 0.0989, "num_tokens": 1192404868.0, "reward": 1.6171876192092896, "reward_std": 0.2753956615924835, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16662296652793884, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 966.79248046875, "completions/mean_terminated_length": 809.1738891601562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4413190559906238, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13827367269633276, "kl": 0.020416259765625, "learning_rate": 7.171303764417552e-07, "loss": 0.1372, "num_tokens": 1192908455.0, "reward": 1.4977679252624512, "reward_std": 0.39043644070625305, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.18991795182228088, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1043.8304443359375, "completions/mean_terminated_length": 801.8282470703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4415321506579298, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12186395133538762, "kl": 0.018402099609375, "learning_rate": 7.168195091833605e-07, "loss": 0.0971, "num_tokens": 1193438331.0, "reward": 1.477678656578064, "reward_std": 0.347394198179245, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1853403002023697, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 985.7857666015625, "completions/mean_terminated_length": 733.4364624023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44174524532523574, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12946785140580697, "kl": 0.019256591796875, "learning_rate": 7.165085496031368e-07, "loss": 0.0146, "num_tokens": 1193949067.0, "reward": 1.5306919813156128, "reward_std": 0.270818829536438, "rewards/accuracy_reward/mean": 0.5949074029922485, "rewards/accuracy_reward/std": 0.49147912859916687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1540675312280655, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 993.654052734375, "completions/mean_terminated_length": 801.701904296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4419583399925417, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13845541621618165, "kl": 0.0198974609375, "learning_rate": 7.161974978731759e-07, "loss": 0.0772, "num_tokens": 1194456992.0, "reward": 1.5340402126312256, "reward_std": 0.33164268732070923, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.19119465351104736, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1040.1763916015625, "completions/mean_terminated_length": 850.3739624023438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.44217143465984765, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11387679921948027, "kl": 0.019378662109375, "learning_rate": 7.158863541656214e-07, "loss": 0.0344, "num_tokens": 1194986127.0, "reward": 1.4793527126312256, "reward_std": 0.3044973909854889, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15676647424697876, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 941.6741333007812, "completions/mean_terminated_length": 818.1389770507812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4423845293271536, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13489095553891556, "kl": 0.021697998046875, "learning_rate": 7.155751186526673e-07, "loss": 0.0716, "num_tokens": 1195478269.0, "reward": 1.6082589626312256, "reward_std": 0.3194030523300171, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.18955937027931213, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 994.5938110351562, "completions/mean_terminated_length": 847.1704711914062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.44259762399445957, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3116568397646185, "kl": 0.022247314453125, "learning_rate": 7.152637915065585e-07, "loss": 0.0251, "num_tokens": 1195998231.0, "reward": 1.4821429252624512, "reward_std": 0.25211286544799805, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.158249631524086, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1002.747802734375, "completions/mean_terminated_length": 837.9922485351562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.44281071866176547, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13330991813049012, "kl": 0.019134521484375, "learning_rate": 7.149523728995913e-07, "loss": 0.0509, "num_tokens": 1196514918.0, "reward": 1.5418527126312256, "reward_std": 0.2718406915664673, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.15814605355262756, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1001.638427734375, "completions/mean_terminated_length": 767.2076416015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4430238133290714, "frac_reward_zero_std": 0.0, "grad_norm": 0.13489759213789615, "kl": 0.020843505859375, "learning_rate": 7.146408630041116e-07, "loss": 0.0322, "num_tokens": 1197031412.0, "reward": 1.5669643878936768, "reward_std": 0.2913854718208313, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14875037968158722, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1172.4554443359375, "completions/mean_terminated_length": 887.5148315429688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4432369079963774, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11855766035925569, "kl": 0.0160980224609375, "learning_rate": 7.143292619925164e-07, "loss": 0.063, "num_tokens": 1197630752.0, "reward": 1.3158482313156128, "reward_std": 0.3204297423362732, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.21593762934207916, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 992.2410888671875, "completions/mean_terminated_length": 853.6060791015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.44345000266368334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371884321618812, "kl": 0.02081298828125, "learning_rate": 7.14017570037253e-07, "loss": 0.0566, "num_tokens": 1198139740.0, "reward": 1.473772406578064, "reward_std": 0.32192912697792053, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16576191782951355, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1110.9910888671875, "completions/mean_terminated_length": 901.06005859375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4436630973309893, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1288664620879469, "kl": 0.0181884765625, "learning_rate": 7.137057873108192e-07, "loss": 0.0753, "num_tokens": 1198710408.0, "reward": 1.5033482313156128, "reward_std": 0.33159375190734863, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19537115097045898, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1164.009033203125, "completions/mean_terminated_length": 1003.0712890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44387619199829526, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11183260971800611, "kl": 0.015350341796875, "learning_rate": 7.133939139857625e-07, "loss": 0.0564, "num_tokens": 1199306652.0, "reward": 1.4910714626312256, "reward_std": 0.3213631212711334, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.17503081262111664, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1123.4442138671875, "completions/mean_terminated_length": 894.2367553710938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4440892866656012, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12110334897709045, "kl": 0.0189208984375, "learning_rate": 7.130819502346813e-07, "loss": 0.0564, "num_tokens": 1199877347.0, "reward": 1.4754464626312256, "reward_std": 0.3657534718513489, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8973214030265808, "rewards/tag_count_reward/std": 0.24532216787338257, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1111.03125, "completions/mean_terminated_length": 907.3424072265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44430238133290717, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1268828164329816, "kl": 0.018157958984375, "learning_rate": 7.127698962302234e-07, "loss": 0.0879, "num_tokens": 1200448241.0, "reward": 1.4084821939468384, "reward_std": 0.2666908800601959, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.17030231654644012, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1065.0023193359375, "completions/mean_terminated_length": 831.472412109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4445154760002131, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13311921114544228, "kl": 0.019378662109375, "learning_rate": 7.124577521450871e-07, "loss": 0.0708, "num_tokens": 1200985794.0, "reward": 1.4838169813156128, "reward_std": 0.3126163184642792, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9280133843421936, "rewards/tag_count_reward/std": 0.21734584867954254, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1008.7076416015625, "completions/mean_terminated_length": 803.0722045898438, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.44472857066751903, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1314362050223909, "kl": 0.019500732421875, "learning_rate": 7.121455181520199e-07, "loss": 0.0777, "num_tokens": 1201505615.0, "reward": 1.5044643878936768, "reward_std": 0.3508971035480499, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.008928571827709675, "rewards/format_reward/std": 0.09417349100112915, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.17533011734485626, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1229.3773193359375, "completions/mean_terminated_length": 909.046630859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.444941665334825, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11436702343252982, "kl": 0.015777587890625, "learning_rate": 7.118331944238196e-07, "loss": 0.0475, "num_tokens": 1202123672.0, "reward": 1.3694196939468384, "reward_std": 0.33544400334358215, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23562191426753998, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1111.9129638671875, "completions/mean_terminated_length": 908.415771484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.44515476000213094, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11709825199760196, "kl": 0.018524169921875, "learning_rate": 7.115207811333335e-07, "loss": 0.0724, "num_tokens": 1202689249.0, "reward": 1.4263393878936768, "reward_std": 0.3321261405944824, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.17612579464912415, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1120.2098388671875, "completions/mean_terminated_length": 853.6034545898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4453678546694369, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13520223058296615, "kl": 0.0177001953125, "learning_rate": 7.112082784534585e-07, "loss": 0.105, "num_tokens": 1203266719.0, "reward": 1.3286831378936768, "reward_std": 0.3107526898384094, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22645428776741028, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1009.966552734375, "completions/mean_terminated_length": 864.6946411132812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.44558094933674286, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11838570950087995, "kl": 0.018798828125, "learning_rate": 7.108956865571408e-07, "loss": 0.0713, "num_tokens": 1203787872.0, "reward": 1.5976563692092896, "reward_std": 0.3133440613746643, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17458714544773102, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1026.93310546875, "completions/mean_terminated_length": 831.4095458984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4457940440040488, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1483449647535498, "kl": 0.0191650390625, "learning_rate": 7.10583005617376e-07, "loss": 0.1028, "num_tokens": 1204314178.0, "reward": 1.5100446939468384, "reward_std": 0.28503119945526123, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.19173339009284973, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1094.024658203125, "completions/mean_terminated_length": 899.1263427734375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4460071386713548, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.5877693997123878, "kl": 0.03076171875, "learning_rate": 7.10270235807209e-07, "loss": 0.0881, "num_tokens": 1204880909.0, "reward": 1.4670759439468384, "reward_std": 0.3199712038040161, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.14945265650749207, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 984.7656860351562, "completions/mean_terminated_length": 801.0654296875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.4462202333386607, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12127665779475828, "kl": 0.018402099609375, "learning_rate": 7.099573772997344e-07, "loss": 0.0552, "num_tokens": 1205392820.0, "reward": 1.5373884439468384, "reward_std": 0.3277756869792938, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.158248633146286, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1054.83935546875, "completions/mean_terminated_length": 815.4902954101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.44643332800596663, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12048358979440574, "kl": 0.0168914794921875, "learning_rate": 7.096444302680951e-07, "loss": 0.0474, "num_tokens": 1205938412.0, "reward": 1.454241156578064, "reward_std": 0.337785929441452, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.16701209545135498, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1009.5870971679688, "completions/mean_terminated_length": 790.6784057617188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4466464226732726, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12170217902487429, "kl": 0.0174560546875, "learning_rate": 7.093313948854834e-07, "loss": 0.0649, "num_tokens": 1206455779.0, "reward": 1.5697544813156128, "reward_std": 0.3142164945602417, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.17088682949543, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1025.5960693359375, "completions/mean_terminated_length": 826.5679931640625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.44685951734057855, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12604881065521792, "kl": 0.018798828125, "learning_rate": 7.090182713251404e-07, "loss": 0.06, "num_tokens": 1206979406.0, "reward": 1.4720982313156128, "reward_std": 0.3795251250267029, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1795479953289032, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 954.6942138671875, "completions/mean_terminated_length": 759.050048828125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4470726120078845, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14578435836525083, "kl": 0.021575927734375, "learning_rate": 7.08705059760356e-07, "loss": 0.1025, "num_tokens": 1207480229.0, "reward": 1.540178656578064, "reward_std": 0.35240188241004944, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.008928571827709675, "rewards/format_reward/std": 0.09417349100112915, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.15763309597969055, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 980.5022583007812, "completions/mean_terminated_length": 805.8207397460938, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.44728570667519046, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13672660625571847, "kl": 0.018768310546875, "learning_rate": 7.083917603644688e-07, "loss": 0.0618, "num_tokens": 1207989734.0, "reward": 1.5703126192092896, "reward_std": 0.2972128987312317, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17375299334526062, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1044.1629638671875, "completions/mean_terminated_length": 839.0779418945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4474988013424964, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.39424230950159644, "kl": 0.0173797607421875, "learning_rate": 7.08078373310866e-07, "loss": 0.0942, "num_tokens": 1208530639.0, "reward": 1.3482143878936768, "reward_std": 0.3589441180229187, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9241071343421936, "rewards/tag_count_reward/std": 0.23192918300628662, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 906.9933471679688, "completions/mean_terminated_length": 753.8961791992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4477118960098024, "frac_reward_zero_std": 0.0, "grad_norm": 0.7440586227927102, "kl": 0.053009033203125, "learning_rate": 7.077648987729837e-07, "loss": 0.1303, "num_tokens": 1209006060.0, "reward": 1.4843751192092896, "reward_std": 0.377036988735199, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23015688359737396, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 993.97998046875, "completions/mean_terminated_length": 805.3658447265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4479249906771083, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13184477168399156, "kl": 0.019744873046875, "learning_rate": 7.074513369243056e-07, "loss": 0.0945, "num_tokens": 1209524547.0, "reward": 1.5234376192092896, "reward_std": 0.41563335061073303, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9252232313156128, "rewards/tag_count_reward/std": 0.21083608269691467, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 925.263427734375, "completions/mean_terminated_length": 777.8333129882812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44813808534441424, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1505201871754739, "kl": 0.020660400390625, "learning_rate": 7.071376879383647e-07, "loss": 0.0833, "num_tokens": 1210007881.0, "reward": 1.4760044813156128, "reward_std": 0.31187623739242554, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995505809783936, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1533121019601822, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1011.9844360351562, "completions/mean_terminated_length": 836.1593017578125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4483511800117202, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13061578238488283, "kl": 0.020660400390625, "learning_rate": 7.068239519887411e-07, "loss": 0.0705, "num_tokens": 1210530258.0, "reward": 1.5468751192092896, "reward_std": 0.3386935591697693, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.18765640258789062, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1115.29248046875, "completions/mean_terminated_length": 896.8898315429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.44856427467902615, "frac_reward_zero_std": 0.0, "grad_norm": 0.12882815987970697, "kl": 0.018402099609375, "learning_rate": 7.065101292490639e-07, "loss": 0.0733, "num_tokens": 1211098053.0, "reward": 1.4715402126312256, "reward_std": 0.39873239398002625, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.22931937873363495, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1096.5692138671875, "completions/mean_terminated_length": 911.3572998046875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4487773693463321, "frac_reward_zero_std": 0.0, "grad_norm": 0.1192902968973232, "kl": 0.018310546875, "learning_rate": 7.061962198930102e-07, "loss": 0.0684, "num_tokens": 1211661124.0, "reward": 1.4771206378936768, "reward_std": 0.3917034864425659, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9056919813156128, "rewards/tag_count_reward/std": 0.23448732495307922, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1025.53125, "completions/mean_terminated_length": 809.9838256835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44899046401363807, "frac_reward_zero_std": 0.0, "grad_norm": 0.13020058997101633, "kl": 0.018798828125, "learning_rate": 7.058822240943044e-07, "loss": 0.1142, "num_tokens": 1212191330.0, "reward": 1.4492188692092896, "reward_std": 0.34832221269607544, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9157366156578064, "rewards/tag_count_reward/std": 0.22931937873363495, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 949.997802734375, "completions/mean_terminated_length": 760.2905883789062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.449203558680944, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14340437507437326, "kl": 0.021240234375, "learning_rate": 7.055681420267196e-07, "loss": 0.0867, "num_tokens": 1212683425.0, "reward": 1.4760044813156128, "reward_std": 0.4063599705696106, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.23134104907512665, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1002.0402221679688, "completions/mean_terminated_length": 798.4266357421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.44941665334825, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13909452443393927, "kl": 0.022216796875, "learning_rate": 7.052539738640757e-07, "loss": 0.0767, "num_tokens": 1213200563.0, "reward": 1.4614956378936768, "reward_std": 0.30341097712516785, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.1856423318386078, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1066.9285888671875, "completions/mean_terminated_length": 850.3978271484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4496297480155559, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.5242757140992893, "kl": 0.019989013671875, "learning_rate": 7.049397197802408e-07, "loss": 0.133, "num_tokens": 1213748995.0, "reward": 1.3906251192092896, "reward_std": 0.3241032063961029, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.1884000152349472, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 867.1719360351562, "completions/mean_terminated_length": 659.5196533203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.44984284268286184, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13811446746886555, "kl": 0.020538330078125, "learning_rate": 7.04625379949131e-07, "loss": 0.1121, "num_tokens": 1214212672.0, "reward": 1.594866156578064, "reward_std": 0.3024706244468689, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969305515289307, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17132186889648438, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 975.3795166015625, "completions/mean_terminated_length": 834.5302734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4500559373501678, "frac_reward_zero_std": 0.0, "grad_norm": 0.13469183827692344, "kl": 0.020263671875, "learning_rate": 7.043109545447087e-07, "loss": 0.0941, "num_tokens": 1214719066.0, "reward": 1.6395089626312256, "reward_std": 0.33141523599624634, "rewards/accuracy_reward/mean": 0.7075892686843872, "rewards/accuracy_reward/std": 0.4553784728050232, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.19385728240013123, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1035.6763916015625, "completions/mean_terminated_length": 812.2479248046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.45026903201747376, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12079807317986545, "kl": 0.01953125, "learning_rate": 7.039964437409844e-07, "loss": 0.0777, "num_tokens": 1215256905.0, "reward": 1.5055804252624512, "reward_std": 0.389077752828598, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20431670546531677, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1115.609375, "completions/mean_terminated_length": 931.1256713867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4504821266847797, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12347592414109657, "kl": 0.018707275390625, "learning_rate": 7.036818477120163e-07, "loss": 0.0731, "num_tokens": 1215824042.0, "reward": 1.4687501192092896, "reward_std": 0.351925253868103, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.18240725994110107, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1060.665283203125, "completions/mean_terminated_length": 862.1394653320312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4506952213520857, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1338831279336727, "kl": 0.0203857421875, "learning_rate": 7.033671666319085e-07, "loss": 0.0839, "num_tokens": 1216365668.0, "reward": 1.4944196939468384, "reward_std": 0.38135501742362976, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9341517686843872, "rewards/tag_count_reward/std": 0.19959399104118347, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1099.071533203125, "completions/mean_terminated_length": 867.1111450195312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.45090831601939163, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1436092920952251, "kl": 0.020538330078125, "learning_rate": 7.030524006748135e-07, "loss": 0.101, "num_tokens": 1216927588.0, "reward": 1.4241071939468384, "reward_std": 0.3675498366355896, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.20673725008964539, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1129.5491943359375, "completions/mean_terminated_length": 911.3536376953125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4511214106866976, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12107033314166665, "kl": 0.017791748046875, "learning_rate": 7.027375500149297e-07, "loss": 0.099, "num_tokens": 1217505306.0, "reward": 1.4051339626312256, "reward_std": 0.4194040596485138, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9162946343421936, "rewards/tag_count_reward/std": 0.21078869700431824, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1123.6875, "completions/mean_terminated_length": 952.5184936523438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4513345053540035, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1260000089118119, "kl": 0.0186767578125, "learning_rate": 7.024226148265032e-07, "loss": 0.1062, "num_tokens": 1218075294.0, "reward": 1.4921876192092896, "reward_std": 0.40922603011131287, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9095982313156128, "rewards/tag_count_reward/std": 0.22355039417743683, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1088.6451416015625, "completions/mean_terminated_length": 898.8262329101562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.45154760002130945, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11318952807575937, "kl": 0.0194091796875, "learning_rate": 7.021075952838262e-07, "loss": 0.0176, "num_tokens": 1218628735.0, "reward": 1.5016741752624512, "reward_std": 0.3029335141181946, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18560869991779327, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1002.7701416015625, "completions/mean_terminated_length": 792.603271484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4517606946886154, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13267192512029619, "kl": 0.0208740234375, "learning_rate": 7.017924915612381e-07, "loss": 0.0859, "num_tokens": 1219143912.0, "reward": 1.5273438692092896, "reward_std": 0.33173060417175293, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9313616156578064, "rewards/tag_count_reward/std": 0.1954566091299057, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1136.9263916015625, "completions/mean_terminated_length": 907.8854370117188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.45197378935592136, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.144570626548426, "kl": 0.01812744140625, "learning_rate": 7.014773038331247e-07, "loss": 0.1206, "num_tokens": 1219720903.0, "reward": 1.493303656578064, "reward_std": 0.42330852150917053, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9107142686843872, "rewards/tag_count_reward/std": 0.23136872053146362, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1104.6004638671875, "completions/mean_terminated_length": 877.2437133789062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4521868840232273, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1237456793686293, "kl": 0.0191650390625, "learning_rate": 7.011620322739183e-07, "loss": 0.0869, "num_tokens": 1220287428.0, "reward": 1.4129464626312256, "reward_std": 0.36484357714653015, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.2053801566362381, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1059.3148193359375, "completions/mean_terminated_length": 847.64501953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4523999786905333, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1254572340069438, "kl": 0.018341064453125, "learning_rate": 7.008466770580972e-07, "loss": 0.1142, "num_tokens": 1220834225.0, "reward": 1.4927456378936768, "reward_std": 0.3333226144313812, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.1951945573091507, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1140.078125, "completions/mean_terminated_length": 895.7365112304688, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.45261307335783924, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11762407525780962, "kl": 0.016754150390625, "learning_rate": 7.005312383601869e-07, "loss": 0.1207, "num_tokens": 1221415252.0, "reward": 1.4040179252624512, "reward_std": 0.3198819160461426, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.18765640258789062, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1041.7366943359375, "completions/mean_terminated_length": 832.8894653320312, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4528261680251452, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13560181056531387, "kl": 0.02166748046875, "learning_rate": 7.002157163547583e-07, "loss": 0.0721, "num_tokens": 1221946702.0, "reward": 1.6021206378936768, "reward_std": 0.32840245962142944, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18103241920471191, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1144.743408203125, "completions/mean_terminated_length": 881.835693359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4530392626924511, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12435096100083422, "kl": 0.018402099609375, "learning_rate": 6.999001112164288e-07, "loss": 0.0844, "num_tokens": 1222527899.0, "reward": 1.4531251192092896, "reward_std": 0.3638610243797302, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19418221712112427, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 961.4553833007812, "completions/mean_terminated_length": 828.02001953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.45325235735975705, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12710405916485282, "kl": 0.02288818359375, "learning_rate": 6.995844231198616e-07, "loss": 0.0502, "num_tokens": 1223028551.0, "reward": 1.6623884439468384, "reward_std": 0.35487285256385803, "rewards/accuracy_reward/mean": 0.7075892686843872, "rewards/accuracy_reward/std": 0.4553784728050232, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.16183683276176453, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1027.7991943359375, "completions/mean_terminated_length": 802.6321411132812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.453465452027063, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13059197610576262, "kl": 0.021148681640625, "learning_rate": 6.992686522397658e-07, "loss": 0.1252, "num_tokens": 1223560781.0, "reward": 1.5122768878936768, "reward_std": 0.40864914655685425, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.2057536393404007, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 960.9420166015625, "completions/mean_terminated_length": 824.3768920898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.45367854669436897, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1341443597428575, "kl": 0.019683837890625, "learning_rate": 6.989527987508966e-07, "loss": 0.118, "num_tokens": 1224068115.0, "reward": 1.6434152126312256, "reward_std": 0.2862638831138611, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13073945045471191, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1269.935302734375, "completions/mean_terminated_length": 998.081298828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4538916413616749, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.09929585381278933, "kl": 0.015869140625, "learning_rate": 6.986368628280547e-07, "loss": 0.0768, "num_tokens": 1224708054.0, "reward": 1.356584906578064, "reward_std": 0.39402303099632263, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9302455186843872, "rewards/tag_count_reward/std": 0.2075621336698532, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1052.9598388671875, "completions/mean_terminated_length": 813.1578979492188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4541047360289809, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12388351827629222, "kl": 0.019134521484375, "learning_rate": 6.983208446460863e-07, "loss": 0.0427, "num_tokens": 1225247524.0, "reward": 1.5000001192092896, "reward_std": 0.3233374357223511, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12415824085474014, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.14943698048591614, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 996.8035888671875, "completions/mean_terminated_length": 788.8128662109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.45431783069628684, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1281292900187781, "kl": 0.021148681640625, "learning_rate": 6.980047443798835e-07, "loss": 0.0451, "num_tokens": 1225768668.0, "reward": 1.4436384439468384, "reward_std": 0.2854624092578888, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16228362917900085, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1016.5000610351562, "completions/mean_terminated_length": 818.9786987304688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4545309253635928, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11896062847183478, "kl": 0.018768310546875, "learning_rate": 6.976885622043836e-07, "loss": 0.0581, "num_tokens": 1226292604.0, "reward": 1.5212054252624512, "reward_std": 0.3162801265716553, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1849188506603241, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1236.4241943359375, "completions/mean_terminated_length": 901.041015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4547440200308987, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11863223305708431, "kl": 0.0163116455078125, "learning_rate": 6.973722982945688e-07, "loss": 0.0504, "num_tokens": 1226918906.0, "reward": 1.407366156578064, "reward_std": 0.3814505934715271, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18618372082710266, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1013.9375610351562, "completions/mean_terminated_length": 815.9254760742188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.45495711469820466, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13058947363371667, "kl": 0.0185546875, "learning_rate": 6.970559528254674e-07, "loss": 0.065, "num_tokens": 1227445550.0, "reward": 1.4938616752624512, "reward_std": 0.3140023350715637, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.01116071455180645, "rewards/format_reward/std": 0.10517053306102753, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1503853052854538, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1101.4576416015625, "completions/mean_terminated_length": 860.1820678710938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.4551702093655106, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12247783944159751, "kl": 0.017578125, "learning_rate": 6.967395259721523e-07, "loss": 0.1107, "num_tokens": 1228011579.0, "reward": 1.4481027126312256, "reward_std": 0.3940381407737732, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1929948478937149, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1016.2410888671875, "completions/mean_terminated_length": 837.9790649414062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.45538330403281657, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11448485578315898, "kl": 0.01849365234375, "learning_rate": 6.964230179097414e-07, "loss": 0.0808, "num_tokens": 1228528951.0, "reward": 1.4720982313156128, "reward_std": 0.26490485668182373, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15099525451660156, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1049.85498046875, "completions/mean_terminated_length": 849.155517578125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.45559639870012253, "frac_reward_zero_std": 0.0, "grad_norm": 0.13943064016708315, "kl": 0.01898193359375, "learning_rate": 6.96106428813398e-07, "loss": 0.1359, "num_tokens": 1229071862.0, "reward": 1.540178656578064, "reward_std": 0.3550258278846741, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.1970413774251938, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 948.7188110351562, "completions/mean_terminated_length": 768.8363647460938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4558094933674285, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14212882734125562, "kl": 0.022186279296875, "learning_rate": 6.957897588583298e-07, "loss": 0.082, "num_tokens": 1229556872.0, "reward": 1.6043527126312256, "reward_std": 0.3201252520084381, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.17706511914730072, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1111.290283203125, "completions/mean_terminated_length": 888.7569580078125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.45602258803473444, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12307676872354202, "kl": 0.01739501953125, "learning_rate": 6.954730082197891e-07, "loss": 0.0911, "num_tokens": 1230129178.0, "reward": 1.5055804252624512, "reward_std": 0.39122167229652405, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.013392857275903225, "rewards/format_reward/std": 0.11507843434810638, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17773102223873138, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 916.7098388671875, "completions/mean_terminated_length": 689.2386474609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4562356827020404, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1402390928030813, "kl": 0.018890380859375, "learning_rate": 6.951561770730736e-07, "loss": 0.0944, "num_tokens": 1230608680.0, "reward": 1.5742188692092896, "reward_std": 0.3198275566101074, "rewards/accuracy_reward/mean": 0.6481481194496155, "rewards/accuracy_reward/std": 0.4781017005443573, "rewards/format_reward/mean": 0.008928571827709675, "rewards/format_reward/std": 0.09417349100112915, "rewards/tag_count_reward/mean": 0.9402901530265808, "rewards/tag_count_reward/std": 0.18750248849391937, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1068.5848388671875, "completions/mean_terminated_length": 871.6514892578125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.45644877736934636, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13112911467592722, "kl": 0.0213623046875, "learning_rate": 6.948392655935247e-07, "loss": 0.0739, "num_tokens": 1231155182.0, "reward": 1.5725446939468384, "reward_std": 0.3506115674972534, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.14909827709197998, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1123.7857666015625, "completions/mean_terminated_length": 868.3760375976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.45666187203665226, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.17295181095191361, "kl": 0.019500732421875, "learning_rate": 6.945222739565288e-07, "loss": 0.0433, "num_tokens": 1231731470.0, "reward": 1.3364956378936768, "reward_std": 0.3273019790649414, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.01116071455180645, "rewards/format_reward/std": 0.10517053306102753, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.1753861904144287, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1133.52685546875, "completions/mean_terminated_length": 887.4220581054688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4568749667039582, "frac_reward_zero_std": 0.0, "grad_norm": 0.12518814156220218, "kl": 0.019012451171875, "learning_rate": 6.942052023375166e-07, "loss": 0.0843, "num_tokens": 1232303114.0, "reward": 1.4162946939468384, "reward_std": 0.3659404516220093, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0223214291036129, "rewards/format_reward/std": 0.14789186418056488, "rewards/tag_count_reward/mean": 0.9319196343421936, "rewards/tag_count_reward/std": 0.20024341344833374, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1081.477783203125, "completions/mean_terminated_length": 810.8514404296875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4570880613712642, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11881452334494458, "kl": 0.0169525146484375, "learning_rate": 6.938880509119628e-07, "loss": 0.1105, "num_tokens": 1232857488.0, "reward": 1.4196429252624512, "reward_std": 0.3262912929058075, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.004464285913854837, "rewards/format_reward/std": 0.06674052774906158, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.18540765345096588, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1039.9263916015625, "completions/mean_terminated_length": 807.2940063476562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.45730115603857013, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13260945796253176, "kl": 0.019500732421875, "learning_rate": 6.935708198553864e-07, "loss": 0.0536, "num_tokens": 1233394015.0, "reward": 1.477678656578064, "reward_std": 0.34392327070236206, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.17112135887145996, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 931.1875610351562, "completions/mean_terminated_length": 741.650146484375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4575142507058761, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14432476253453902, "kl": 0.02081298828125, "learning_rate": 6.932535093433509e-07, "loss": 0.0704, "num_tokens": 1233871363.0, "reward": 1.6858259439468384, "reward_std": 0.24119317531585693, "rewards/accuracy_reward/mean": 0.7209821343421936, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.13939984142780304, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1076.790283203125, "completions/mean_terminated_length": 862.4359741210938, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.45772734537318205, "frac_reward_zero_std": 0.0, "grad_norm": 0.13118731474571235, "kl": 0.017669677734375, "learning_rate": 6.929361195514628e-07, "loss": 0.0882, "num_tokens": 1234418821.0, "reward": 1.5937501192092896, "reward_std": 0.3818773925304413, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15736359357833862, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 978.607177734375, "completions/mean_terminated_length": 850.2799682617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.457940440040488, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13992743132810606, "kl": 0.02081298828125, "learning_rate": 6.926186506553735e-07, "loss": 0.0729, "num_tokens": 1234923605.0, "reward": 1.5312501192092896, "reward_std": 0.29126328229904175, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.0022321429569274187, "rewards/format_reward/std": 0.047245558351278305, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1632968634366989, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1027.290283203125, "completions/mean_terminated_length": 818.758056640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.45815353470779396, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12863829285221157, "kl": 0.019805908203125, "learning_rate": 6.923011028307776e-07, "loss": 0.0959, "num_tokens": 1235453863.0, "reward": 1.555803656578064, "reward_std": 0.3804105520248413, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.02008928544819355, "rewards/format_reward/std": 0.14046262204647064, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19212691485881805, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1130.33935546875, "completions/mean_terminated_length": 899.6424560546875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.45836662937509987, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11561555312554109, "kl": 0.016754150390625, "learning_rate": 6.919834762534136e-07, "loss": 0.0529, "num_tokens": 1236035551.0, "reward": 1.3789063692092896, "reward_std": 0.31697165966033936, "rewards/accuracy_reward/mean": 0.44212964177131653, "rewards/accuracy_reward/std": 0.4972155690193176, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.1761815994977951, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1011.7991333007812, "completions/mean_terminated_length": 848.4703369140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4585797240424058, "frac_reward_zero_std": 0.0, "grad_norm": 0.12841370085015966, "kl": 0.019378662109375, "learning_rate": 6.916657710990632e-07, "loss": 0.1079, "num_tokens": 1236554213.0, "reward": 1.587053656578064, "reward_std": 0.3580479025840759, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.01116071455180645, "rewards/format_reward/std": 0.10517053306102753, "rewards/tag_count_reward/mean": 0.9308035969734192, "rewards/tag_count_reward/std": 0.19490092992782593, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1083.49560546875, "completions/mean_terminated_length": 806.3390502929688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4587928187097118, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1292043593074205, "kl": 0.0205078125, "learning_rate": 6.913479875435521e-07, "loss": 0.0693, "num_tokens": 1237117763.0, "reward": 1.5000001192092896, "reward_std": 0.32636529207229614, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12415824085474014, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.1814328134059906, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1092.915283203125, "completions/mean_terminated_length": 849.4622192382812, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.45900591337701774, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11233016472911361, "kl": 0.017730712890625, "learning_rate": 6.910301257627493e-07, "loss": 0.0547, "num_tokens": 1237676317.0, "reward": 1.5078126192092896, "reward_std": 0.3548882305622101, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.0066964286379516125, "rewards/format_reward/std": 0.08164843916893005, "rewards/tag_count_reward/mean": 0.9274553656578064, "rewards/tag_count_reward/std": 0.2035330981016159, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1227.32373046875, "completions/mean_terminated_length": 923.6483764648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4592190080443237, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11216898283127252, "kl": 0.015960693359375, "learning_rate": 6.907121859325666e-07, "loss": 0.043, "num_tokens": 1238294526.0, "reward": 1.3510044813156128, "reward_std": 0.40313515067100525, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.02901785634458065, "rewards/format_reward/std": 0.16804419457912445, "rewards/tag_count_reward/mean": 0.9268973469734192, "rewards/tag_count_reward/std": 0.215030238032341, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1045.747802734375, "completions/mean_terminated_length": 831.1734619140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.45943210271162965, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15084060297496396, "kl": 0.0203857421875, "learning_rate": 6.903941682289598e-07, "loss": 0.0187, "num_tokens": 1238826493.0, "reward": 1.4168527126312256, "reward_std": 0.31627556681632996, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.013392857275903225, "rewards/format_reward/std": 0.11507843434810638, "rewards/tag_count_reward/mean": 0.9347098469734192, "rewards/tag_count_reward/std": 0.19009453058242798, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1096.46435546875, "completions/mean_terminated_length": 867.1467895507812, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4596451973789356, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13522629948295403, "kl": 0.01885986328125, "learning_rate": 6.900760728279272e-07, "loss": 0.084, "num_tokens": 1239391629.0, "reward": 1.4224331378936768, "reward_std": 0.3352315127849579, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.01785714365541935, "rewards/format_reward/std": 0.13258016109466553, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1904422789812088, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1047.4910888671875, "completions/mean_terminated_length": 829.9891357421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.45985829204624157, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13435680829720964, "kl": 0.020050048828125, "learning_rate": 6.8975789990551e-07, "loss": 0.0838, "num_tokens": 1239930521.0, "reward": 1.4760044813156128, "reward_std": 0.427381694316864, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.02901785634458065, "rewards/format_reward/std": 0.16804419457912445, "rewards/tag_count_reward/mean": 0.9202008843421936, "rewards/tag_count_reward/std": 0.2235301434993744, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1022.638427734375, "completions/mean_terminated_length": 799.7337036132812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.46007138671354747, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12259538793935482, "kl": 0.01934814453125, "learning_rate": 6.894396496377929e-07, "loss": 0.0447, "num_tokens": 1240456215.0, "reward": 1.602678656578064, "reward_std": 0.34144362807273865, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.0245535708963871, "rewards/format_reward/std": 0.1549331247806549, "rewards/tag_count_reward/mean": 0.9419642686843872, "rewards/tag_count_reward/std": 0.18690980970859528, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1072.6898193359375, "completions/mean_terminated_length": 850.9068603515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4602844813808534, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12594602883892275, "kl": 0.01708984375, "learning_rate": 6.891213222009029e-07, "loss": 0.1491, "num_tokens": 1241003948.0, "reward": 1.4665179252624512, "reward_std": 0.4376295804977417, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.0446428582072258, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.19785068929195404, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1062.8795166015625, "completions/mean_terminated_length": 877.352783203125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4604975760481594, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12824733230663907, "kl": 0.019561767578125, "learning_rate": 6.888029177710098e-07, "loss": 0.0591, "num_tokens": 1241543446.0, "reward": 1.5937501192092896, "reward_std": 0.41953450441360474, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.0357142873108387, "rewards/format_reward/std": 0.18578433990478516, "rewards/tag_count_reward/mean": 0.9330357313156128, "rewards/tag_count_reward/std": 0.19496497511863708, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1061.921875, "completions/mean_terminated_length": 860.4650268554688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.46071067071546534, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11619496010771999, "kl": 0.018218994140625, "learning_rate": 6.884844365243263e-07, "loss": 0.0535, "num_tokens": 1242080563.0, "reward": 1.4927456378936768, "reward_std": 0.35678109526634216, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.17418713867664337, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1515924483537674, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1146.2388916015625, "completions/mean_terminated_length": 859.7970581054688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4609237653827713, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12677080195602433, "kl": 0.0172271728515625, "learning_rate": 6.881658786371071e-07, "loss": 0.0889, "num_tokens": 1242669262.0, "reward": 1.3560268878936768, "reward_std": 0.4218686819076538, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.02008928544819355, "rewards/format_reward/std": 0.14046260714530945, "rewards/tag_count_reward/mean": 0.9229910969734192, "rewards/tag_count_reward/std": 0.2113564908504486, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1023.12060546875, "completions/mean_terminated_length": 813.736572265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.46113686005007726, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11948263972405707, "kl": 0.01934814453125, "learning_rate": 6.8784724428565e-07, "loss": 0.0415, "num_tokens": 1243196196.0, "reward": 1.6540179252624512, "reward_std": 0.35513198375701904, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.0379464291036129, "rewards/format_reward/std": 0.19128035008907318, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1700088232755661, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1121.638427734375, "completions/mean_terminated_length": 888.754150390625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.4613499547173832, "frac_reward_zero_std": 0.0, "grad_norm": 0.1253620262710644, "kl": 0.018798828125, "learning_rate": 6.875285336462942e-07, "loss": 0.133, "num_tokens": 1243769938.0, "reward": 1.547991156578064, "reward_std": 0.4231022000312805, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.0401785708963871, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.19677190482616425, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1020.779052734375, "completions/mean_terminated_length": 849.5755615234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4615630493846892, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1256639312291695, "kl": 0.019012451171875, "learning_rate": 6.872097468954222e-07, "loss": 0.0634, "num_tokens": 1244289743.0, "reward": 1.6590402126312256, "reward_std": 0.39027541875839233, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.0491071417927742, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.161489337682724, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1037.305908203125, "completions/mean_terminated_length": 834.0831298828125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4617761440519951, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13823255037735785, "kl": 0.020111083984375, "learning_rate": 6.868908842094577e-07, "loss": 0.147, "num_tokens": 1244823112.0, "reward": 1.5200893878936768, "reward_std": 0.5107468962669373, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.0691964253783226, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9151785969734192, "rewards/tag_count_reward/std": 0.23002667725086212, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1012.6094360351562, "completions/mean_terminated_length": 817.6153564453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46198923871930103, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1304483099112223, "kl": 0.020660400390625, "learning_rate": 6.865719457648668e-07, "loss": 0.1326, "num_tokens": 1245347689.0, "reward": 1.5641741752624512, "reward_std": 0.41145244240760803, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.0647321417927742, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9324776530265808, "rewards/tag_count_reward/std": 0.20491690933704376, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1115.5648193359375, "completions/mean_terminated_length": 881.1536254882812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.462202333386607, "frac_reward_zero_std": 0.0, "grad_norm": 0.13018325682987436, "kl": 0.01898193359375, "learning_rate": 6.862529317381578e-07, "loss": 0.0578, "num_tokens": 1245916358.0, "reward": 1.5178571939468384, "reward_std": 0.4868509769439697, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.1116071417927742, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1875898689031601, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1114.763427734375, "completions/mean_terminated_length": 870.2816772460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.46241542805391295, "frac_reward_zero_std": 0.0, "grad_norm": 0.2147844532380764, "kl": 0.019866943359375, "learning_rate": 6.859338423058802e-07, "loss": 0.123, "num_tokens": 1246487404.0, "reward": 1.5580357313156128, "reward_std": 0.4804766774177551, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.0915178582072258, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9285714030265808, "rewards/tag_count_reward/std": 0.21913130581378937, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1005.5670166015625, "completions/mean_terminated_length": 822.251953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4626285227212189, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12264472902243723, "kl": 0.021148681640625, "learning_rate": 6.856146776446258e-07, "loss": 0.0947, "num_tokens": 1247006666.0, "reward": 1.6121652126312256, "reward_std": 0.418075293302536, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.0803571417927742, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16744044423103333, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 963.5313110351562, "completions/mean_terminated_length": 830.3508911132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.46284161738852486, "frac_reward_zero_std": 0.0, "grad_norm": 0.13568384811676387, "kl": 0.02032470703125, "learning_rate": 6.852954379310276e-07, "loss": 0.1243, "num_tokens": 1247505096.0, "reward": 1.7226563692092896, "reward_std": 0.5068737268447876, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.1428571492433548, "rewards/format_reward/std": 0.3503182828426361, "rewards/tag_count_reward/mean": 0.9391741156578064, "rewards/tag_count_reward/std": 0.18260477483272552, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1175.368408203125, "completions/mean_terminated_length": 887.943603515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4630547120558308, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12012914069208075, "kl": 0.017059326171875, "learning_rate": 6.849761233417606e-07, "loss": 0.0527, "num_tokens": 1248103933.0, "reward": 1.5145089626312256, "reward_std": 0.5383719205856323, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "rewards/format_reward/mean": 0.2120535671710968, "rewards/format_reward/std": 0.40921956300735474, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.190032958984375, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1102.87060546875, "completions/mean_terminated_length": 848.5155639648438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4632678067231368, "frac_reward_zero_std": 0.0, "grad_norm": 0.1309608864990655, "kl": 0.0185546875, "learning_rate": 6.846567340535411e-07, "loss": 0.0883, "num_tokens": 1248674563.0, "reward": 1.6819196939468384, "reward_std": 0.5417090654373169, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.2544642984867096, "rewards/format_reward/std": 0.4360465705394745, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21302737295627594, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1068.890625, "completions/mean_terminated_length": 761.6627807617188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4634809013904427, "frac_reward_zero_std": 0.0, "grad_norm": 0.18373067984644076, "kl": 0.024200439453125, "learning_rate": 6.843372702431262e-07, "loss": 0.1114, "num_tokens": 1249228274.0, "reward": 1.6657366752624512, "reward_std": 0.563531756401062, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.2678571343421936, "rewards/format_reward/std": 0.4433377981185913, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.1896803379058838, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 905.7857666015625, "completions/mean_terminated_length": 715.4166870117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46369399605774864, "frac_reward_zero_std": 0.0, "grad_norm": 0.18641426264147656, "kl": 0.02899169921875, "learning_rate": 6.840177320873148e-07, "loss": 0.0986, "num_tokens": 1249707570.0, "reward": 1.7946429252624512, "reward_std": 0.5405769348144531, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.3683035671710968, "rewards/format_reward/std": 0.4828835427761078, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.14598877727985382, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 941.0558471679688, "completions/mean_terminated_length": 746.3963012695312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4639070907250546, "frac_reward_zero_std": 0.0, "grad_norm": 0.1544676366259497, "kl": 0.025360107421875, "learning_rate": 6.836981197629469e-07, "loss": 0.1289, "num_tokens": 1250197243.0, "reward": 1.9296876192092896, "reward_std": 0.6100171208381653, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.4944108724594116, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.19679729640483856, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 918.32373046875, "completions/mean_terminated_length": 740.260986328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46412018539236055, "frac_reward_zero_std": 0.0, "grad_norm": 6.134863106794164, "kl": 0.462158203125, "learning_rate": 6.833784334469034e-07, "loss": 0.1227, "num_tokens": 1250680748.0, "reward": 2.009486675262451, "reward_std": 0.595630943775177, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.5111607313156128, "rewards/format_reward/std": 0.5004342198371887, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.15670275688171387, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 772.2254638671875, "completions/mean_terminated_length": 633.2796630859375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4643332800596665, "frac_reward_zero_std": 0.0, "grad_norm": 0.1545240759018115, "kl": 0.03045654296875, "learning_rate": 6.830586733161063e-07, "loss": 0.0884, "num_tokens": 1251089537.0, "reward": 2.0792412757873535, "reward_std": 0.6041925549507141, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.5691964030265808, "rewards/format_reward/std": 0.4957422614097595, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.1879986971616745, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 883.5982666015625, "completions/mean_terminated_length": 693.0597534179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46454637472697247, "frac_reward_zero_std": 0.0, "grad_norm": 0.16020563431350518, "kl": 0.030059814453125, "learning_rate": 6.827388395475183e-07, "loss": 0.1212, "num_tokens": 1251550541.0, "reward": 2.2036831378936768, "reward_std": 0.6157270669937134, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.6361607313156128, "rewards/format_reward/std": 0.4816409945487976, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.17139743268489838, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 952.33935546875, "completions/mean_terminated_length": 732.0321655273438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4647594693942784, "frac_reward_zero_std": 0.0, "grad_norm": 0.14604092317056258, "kl": 0.026763916015625, "learning_rate": 6.824189323181429e-07, "loss": 0.1101, "num_tokens": 1252046517.0, "reward": 2.123326063156128, "reward_std": 0.5768536329269409, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.7477678656578064, "rewards/format_reward/std": 0.4347793161869049, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16430194675922394, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 968.18310546875, "completions/mean_terminated_length": 737.0027465820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4649725640615844, "frac_reward_zero_std": 0.0, "grad_norm": 0.13988484413635563, "kl": 0.02703857421875, "learning_rate": 6.820989518050244e-07, "loss": 0.1146, "num_tokens": 1252551079.0, "reward": 2.2098214626312256, "reward_std": 0.5962929725646973, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.7522321343421936, "rewards/format_reward/std": 0.4321989119052887, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18524597585201263, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 914.8594360351562, "completions/mean_terminated_length": 687.01611328125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4651856587288903, "frac_reward_zero_std": 0.0, "grad_norm": 0.1397153083992657, "kl": 0.028564453125, "learning_rate": 6.817788981852471e-07, "loss": 0.11, "num_tokens": 1253030984.0, "reward": 2.1863839626312256, "reward_std": 0.5598536729812622, "rewards/accuracy_reward/mean": 0.46990740299224854, "rewards/accuracy_reward/std": 0.4996722638607025, "rewards/format_reward/mean": 0.7767857313156128, "rewards/format_reward/std": 0.41686636209487915, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16740036010742188, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1006.060302734375, "completions/mean_terminated_length": 782.9891967773438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.46539875339619624, "frac_reward_zero_std": 0.0, "grad_norm": 0.14043072337143242, "kl": 0.026641845703125, "learning_rate": 6.814587716359366e-07, "loss": 0.0795, "num_tokens": 1253552691.0, "reward": 2.2310268878936768, "reward_std": 0.5390534400939941, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.8102678656578064, "rewards/format_reward/std": 0.39252743124961853, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.17929750680923462, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 960.5245971679688, "completions/mean_terminated_length": 741.86328125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4656118480635022, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12791565349488476, "kl": 0.0283203125, "learning_rate": 6.811385723342583e-07, "loss": 0.0547, "num_tokens": 1254051726.0, "reward": 2.213169813156128, "reward_std": 0.5340933203697205, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.8080357313156128, "rewards/format_reward/std": 0.3942854404449463, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16144777834415436, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 928.5870971679688, "completions/mean_terminated_length": 688.9295654296875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.46582494273080816, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14244584010572325, "kl": 0.027801513671875, "learning_rate": 6.808183004574181e-07, "loss": 0.05, "num_tokens": 1254536389.0, "reward": 2.2940850257873535, "reward_std": 0.4687335789203644, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.8727678656578064, "rewards/format_reward/std": 0.3336053788661957, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.17231273651123047, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 814.3995971679688, "completions/mean_terminated_length": 700.0658569335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4660380373981141, "frac_reward_zero_std": 0.0, "grad_norm": 0.13774832495507758, "kl": 0.03240966796875, "learning_rate": 6.804979561826618e-07, "loss": 0.0473, "num_tokens": 1254973576.0, "reward": 2.4056921005249023, "reward_std": 0.5147993564605713, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3480229377746582, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1706821471452713, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 722.8951416015625, "completions/mean_terminated_length": 631.181396484375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.46625113206542007, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14612963584527716, "kl": 0.0355224609375, "learning_rate": 6.801775396872757e-07, "loss": 0.0464, "num_tokens": 1255367545.0, "reward": 2.5401787757873535, "reward_std": 0.49817097187042236, "rewards/accuracy_reward/mean": 0.6852678656578064, "rewards/accuracy_reward/std": 0.4649282991886139, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 885.0223388671875, "completions/mean_terminated_length": 684.0889892578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46646422673272603, "frac_reward_zero_std": 0.0, "grad_norm": 0.15368045477938116, "kl": 0.02923583984375, "learning_rate": 6.798570511485854e-07, "loss": 0.1468, "num_tokens": 1255838931.0, "reward": 2.369419813156128, "reward_std": 0.48315125703811646, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18948033452033997, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 949.18310546875, "completions/mean_terminated_length": 738.771240234375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.466677321400032, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12496597918471945, "kl": 0.02685546875, "learning_rate": 6.795364907439569e-07, "loss": 0.0382, "num_tokens": 1256329493.0, "reward": 2.2154018878936768, "reward_std": 0.4397036135196686, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.2016845941543579, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 920.8928833007812, "completions/mean_terminated_length": 708.6259765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4668904160673379, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15477025607048256, "kl": 0.0318603515625, "learning_rate": 6.792158586507961e-07, "loss": 0.0531, "num_tokens": 1256807221.0, "reward": 2.342076063156128, "reward_std": 0.48274508118629456, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9358258843421936, "rewards/tag_count_reward/std": 0.20187872648239136, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1001.4777221679688, "completions/mean_terminated_length": 794.4118041992188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.46710351073464385, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1289801018572182, "kl": 0.029205322265625, "learning_rate": 6.788951550465483e-07, "loss": 0.074, "num_tokens": 1257321067.0, "reward": 2.185267925262451, "reward_std": 0.4656493663787842, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.16372442245483398, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 895.0223388671875, "completions/mean_terminated_length": 723.5538940429688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4673166054019498, "frac_reward_zero_std": 0.0, "grad_norm": 0.14544900586030074, "kl": 0.03033447265625, "learning_rate": 6.78574380108698e-07, "loss": 0.0502, "num_tokens": 1257791029.0, "reward": 2.3526787757873535, "reward_std": 0.4276007413864136, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1570618450641632, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 776.5714721679688, "completions/mean_terminated_length": 634.6004638671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.46752970006925576, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.4323689074603266, "kl": 0.033233642578125, "learning_rate": 6.782535340147702e-07, "loss": 0.0367, "num_tokens": 1258211429.0, "reward": 2.37890625, "reward_std": 0.3621874153614044, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18153512477874756, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 929.7567138671875, "completions/mean_terminated_length": 739.9765014648438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4677427947365617, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12852082735331846, "kl": 0.028472900390625, "learning_rate": 6.779326169423279e-07, "loss": 0.0633, "num_tokens": 1258704456.0, "reward": 2.4190850257873535, "reward_std": 0.41469284892082214, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15756873786449432, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1060.6763916015625, "completions/mean_terminated_length": 871.6143188476562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4679558894038677, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12168705589983117, "kl": 0.025054931640625, "learning_rate": 6.776116290689748e-07, "loss": 0.0785, "num_tokens": 1259251447.0, "reward": 2.322544813156128, "reward_std": 0.4087196886539459, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1474141627550125, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 920.044677734375, "completions/mean_terminated_length": 768.69873046875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.46816898407117363, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16256680582418945, "kl": 0.02911376953125, "learning_rate": 6.772905705723527e-07, "loss": 0.0758, "num_tokens": 1259730587.0, "reward": 2.454799175262451, "reward_std": 0.46542397141456604, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18684886395931244, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 897.6629638671875, "completions/mean_terminated_length": 709.4259643554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4683820787384796, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1358733775767159, "kl": 0.02886962890625, "learning_rate": 6.769694416301431e-07, "loss": 0.089, "num_tokens": 1260197908.0, "reward": 2.5279018878936768, "reward_std": 0.4194980561733246, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1573437601327896, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1025.546875, "completions/mean_terminated_length": 754.0480346679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46859517340578555, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1267123037546443, "kl": 0.02593994140625, "learning_rate": 6.766482424200663e-07, "loss": 0.106, "num_tokens": 1260726265.0, "reward": 2.396205425262451, "reward_std": 0.44485175609588623, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.18929573893547058, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 885.732177734375, "completions/mean_terminated_length": 712.882080078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.46880826807309145, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.21485344973014767, "kl": 0.033203125, "learning_rate": 6.763269731198813e-07, "loss": 0.0604, "num_tokens": 1261195905.0, "reward": 2.5284600257873535, "reward_std": 0.3913387060165405, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9709821343421936, "rewards/format_reward/std": 0.16804419457912445, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1340305656194687, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 874.2098388671875, "completions/mean_terminated_length": 730.0601196289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4690213627403974, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1435109214271408, "kl": 0.029388427734375, "learning_rate": 6.760056339073863e-07, "loss": 0.0693, "num_tokens": 1261659055.0, "reward": 2.4559152126312256, "reward_std": 0.42137643694877625, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 891.9531860351562, "completions/mean_terminated_length": 740.1489868164062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.46923445740770336, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13142701975802526, "kl": 0.02764892578125, "learning_rate": 6.756842249604176e-07, "loss": 0.0653, "num_tokens": 1262126938.0, "reward": 2.5066964626312256, "reward_std": 0.4629294276237488, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1457662582397461, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1027.982177734375, "completions/mean_terminated_length": 835.88330078125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4694475520750093, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12128582979291838, "kl": 0.025482177734375, "learning_rate": 6.75362746456851e-07, "loss": 0.0746, "num_tokens": 1262659554.0, "reward": 2.4213171005249023, "reward_std": 0.3704850971698761, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9665178656578064, "rewards/format_reward/std": 0.1800929754972458, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14945264160633087, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1002.310302734375, "completions/mean_terminated_length": 771.5177001953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4696606467423153, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13393528617871056, "kl": 0.02520751953125, "learning_rate": 6.750411985745999e-07, "loss": 0.0955, "num_tokens": 1263174221.0, "reward": 2.4330358505249023, "reward_std": 0.4601520299911499, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17527315020561218, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 900.7433471679688, "completions/mean_terminated_length": 753.3626708984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.46987374140962124, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1366622981075897, "kl": 0.02740478515625, "learning_rate": 6.747195814916165e-07, "loss": 0.0941, "num_tokens": 1263652698.0, "reward": 2.4614956378936768, "reward_std": 0.34770047664642334, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9709821343421936, "rewards/format_reward/std": 0.16804419457912445, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.1634640097618103, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 892.9464721679688, "completions/mean_terminated_length": 760.776123046875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4700868360769272, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14632863444342128, "kl": 0.02874755859375, "learning_rate": 6.74397895385891e-07, "loss": 0.0854, "num_tokens": 1264119874.0, "reward": 2.5027902126312256, "reward_std": 0.4269680380821228, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14827017486095428, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 769.7277221679688, "completions/mean_terminated_length": 651.253662109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47029993074423315, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.156723752788265, "kl": 0.0318603515625, "learning_rate": 6.740761404354523e-07, "loss": 0.066, "num_tokens": 1264530840.0, "reward": 2.4737725257873535, "reward_std": 0.3658413290977478, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775348186493, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14149756729602814, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 845.279052734375, "completions/mean_terminated_length": 687.345947265625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.47051302541153905, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14860018042050208, "kl": 0.03118896484375, "learning_rate": 6.737543168183671e-07, "loss": 0.0812, "num_tokens": 1264978661.0, "reward": 2.5418527126312256, "reward_std": 0.38505449891090393, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1648557037115097, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 824.2857666015625, "completions/mean_terminated_length": 670.552734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.470726120078845, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13552912912935366, "kl": 0.0296630859375, "learning_rate": 6.734324247127402e-07, "loss": 0.048, "num_tokens": 1265417189.0, "reward": 2.591517925262451, "reward_std": 0.34443041682243347, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9776785969734192, "rewards/format_reward/std": 0.1478918492794037, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 888.2410888671875, "completions/mean_terminated_length": 745.8145141601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47093921474615097, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14408904699239972, "kl": 0.03131103515625, "learning_rate": 6.731104642967143e-07, "loss": 0.1076, "num_tokens": 1265882385.0, "reward": 2.4458706378936768, "reward_std": 0.4000353217124939, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1014.0781860351562, "completions/mean_terminated_length": 771.9752197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4711523094134569, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13866595416311292, "kl": 0.02679443359375, "learning_rate": 6.727884357484695e-07, "loss": 0.0759, "num_tokens": 1266407444.0, "reward": 2.381138563156128, "reward_std": 0.36371123790740967, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 897.7277221679688, "completions/mean_terminated_length": 749.9596557617188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4713654040807629, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12317114249750778, "kl": 0.0274658203125, "learning_rate": 6.724663392462241e-07, "loss": 0.0458, "num_tokens": 1266874906.0, "reward": 2.443080425262451, "reward_std": 0.327858567237854, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.49883854389190674, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1613859087228775, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 820.2120971679688, "completions/mean_terminated_length": 651.9365234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47157849874806884, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.17319751460156835, "kl": 0.03863525390625, "learning_rate": 6.721441749682341e-07, "loss": 0.0401, "num_tokens": 1267311049.0, "reward": 2.5340402126312256, "reward_std": 0.36192551255226135, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 992.49560546875, "completions/mean_terminated_length": 763.0380859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4717915934153748, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13811273338012478, "kl": 0.02783203125, "learning_rate": 6.718219430927924e-07, "loss": 0.0779, "num_tokens": 1267832919.0, "reward": 2.4838171005249023, "reward_std": 0.37745755910873413, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14323382079601288, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1001.3594360351562, "completions/mean_terminated_length": 759.8269653320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47200468808268076, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12090751907188951, "kl": 0.02569580078125, "learning_rate": 6.714996437982301e-07, "loss": 0.0855, "num_tokens": 1268353544.0, "reward": 2.388392925262451, "reward_std": 0.44244250655174255, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 958.2388916015625, "completions/mean_terminated_length": 756.4312133789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47221778274998666, "frac_reward_zero_std": 0.25, "grad_norm": 0.1353881829154153, "kl": 0.028533935546875, "learning_rate": 6.711772772629149e-07, "loss": 0.0763, "num_tokens": 1268847075.0, "reward": 2.4363839626312256, "reward_std": 0.3675597310066223, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389532685279846, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 775.1406860351562, "completions/mean_terminated_length": 683.7870483398438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4724308774172926, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.17928233619134123, "kl": 0.038177490234375, "learning_rate": 6.708548436652522e-07, "loss": 0.0627, "num_tokens": 1269263170.0, "reward": 2.48828125, "reward_std": 0.38996705412864685, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1258348822593689, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 843.4219360351562, "completions/mean_terminated_length": 681.794921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4726439720845986, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12890393591896093, "kl": 0.032928466796875, "learning_rate": 6.70532343183684e-07, "loss": 0.0483, "num_tokens": 1269707967.0, "reward": 2.484375, "reward_std": 0.3472144305706024, "rewards/accuracy_reward/mean": 0.5810185074806213, "rewards/accuracy_reward/std": 0.49396437406539917, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13422717154026031, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 881.9219360351562, "completions/mean_terminated_length": 725.4607543945312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.47285706675190453, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14444850950268015, "kl": 0.031494140625, "learning_rate": 6.702097759966897e-07, "loss": 0.097, "num_tokens": 1270170892.0, "reward": 2.46484375, "reward_std": 0.3814384341239929, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.0911129042506218, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 778.966552734375, "completions/mean_terminated_length": 674.746337890625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4730701614192105, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.16281714746927378, "kl": 0.0347900390625, "learning_rate": 6.698871422827857e-07, "loss": 0.0518, "num_tokens": 1270580365.0, "reward": 2.4285714626312256, "reward_std": 0.3740359842777252, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1029.1273193359375, "completions/mean_terminated_length": 790.5482177734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.47328325608651645, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13704965481849363, "kl": 0.027740478515625, "learning_rate": 6.695644422205252e-07, "loss": 0.0744, "num_tokens": 1271108262.0, "reward": 2.3275671005249023, "reward_std": 0.48285603523254395, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.19066503643989563, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 903.9085083007812, "completions/mean_terminated_length": 760.1784057617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4734963507538224, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1342017231116232, "kl": 0.03118896484375, "learning_rate": 6.692416759884978e-07, "loss": 0.088, "num_tokens": 1271585245.0, "reward": 2.3984375, "reward_std": 0.41601473093032837, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12805373966693878, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 845.997802734375, "completions/mean_terminated_length": 698.3834838867188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.47370944542112836, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14661070479050456, "kl": 0.031219482421875, "learning_rate": 6.689188437653298e-07, "loss": 0.0903, "num_tokens": 1272027564.0, "reward": 2.4693081378936768, "reward_std": 0.42670324444770813, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15604011714458466, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 947.5402221679688, "completions/mean_terminated_length": 827.6881103515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.47392254008843426, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12620281372711523, "kl": 0.02490234375, "learning_rate": 6.685959457296842e-07, "loss": 0.0778, "num_tokens": 1272524782.0, "reward": 2.4263393878936768, "reward_std": 0.4004477560520172, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1249600425362587, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 895.622802734375, "completions/mean_terminated_length": 737.6827392578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4741356347557402, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1317392542986843, "kl": 0.029998779296875, "learning_rate": 6.682729820602605e-07, "loss": 0.0906, "num_tokens": 1272991237.0, "reward": 2.4676339626312256, "reward_std": 0.4688414931297302, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1286761611700058, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 975.05810546875, "completions/mean_terminated_length": 802.72021484375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4743487294230462, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1256100173129875, "kl": 0.025970458984375, "learning_rate": 6.679499529357943e-07, "loss": 0.1131, "num_tokens": 1273498287.0, "reward": 2.4386162757873535, "reward_std": 0.4945499897003174, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14839327335357666, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 880.9553833007812, "completions/mean_terminated_length": 707.3948974609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47456182409035214, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.40490642978055996, "kl": 0.046295166015625, "learning_rate": 6.676268585350571e-07, "loss": 0.0894, "num_tokens": 1273958555.0, "reward": 2.359375, "reward_std": 0.46043434739112854, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.17017030715942383, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1101.8660888671875, "completions/mean_terminated_length": 873.8504028320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4747749187576581, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12181976137174877, "kl": 0.023956298828125, "learning_rate": 6.673036990368579e-07, "loss": 0.0739, "num_tokens": 1274526671.0, "reward": 2.3002233505249023, "reward_std": 0.4114280641078949, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14360485970973969, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 965.2745971679688, "completions/mean_terminated_length": 774.8740234375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.47498801342496405, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14857873051432402, "kl": 0.026885986328125, "learning_rate": 6.669804746200396e-07, "loss": 0.1138, "num_tokens": 1275026266.0, "reward": 2.2862725257873535, "reward_std": 0.515708863735199, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1514935940504074, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 899.4866333007812, "completions/mean_terminated_length": 721.8814086914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47520110809227, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.19132930027868855, "kl": 0.034027099609375, "learning_rate": 6.666571854634828e-07, "loss": 0.1158, "num_tokens": 1275502292.0, "reward": 2.4302456378936768, "reward_std": 0.4945792257785797, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692909002304, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 980.529052734375, "completions/mean_terminated_length": 779.4933471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47541420275957597, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371905268921433, "kl": 0.027740478515625, "learning_rate": 6.663338317461031e-07, "loss": 0.0971, "num_tokens": 1276005537.0, "reward": 2.3582589626312256, "reward_std": 0.5346171855926514, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.8794642686843872, "rewards/format_reward/std": 0.3259509205818176, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16375111043453217, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1028.1875, "completions/mean_terminated_length": 813.2000122070312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.47562729742688187, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12728763954654915, "kl": 0.023345947265625, "learning_rate": 6.660104136468524e-07, "loss": 0.1045, "num_tokens": 1276549685.0, "reward": 2.232142925262451, "reward_std": 0.4899819493293762, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.8638392686843872, "rewards/format_reward/std": 0.34334254264831543, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15989768505096436, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 857.9710083007812, "completions/mean_terminated_length": 708.4698486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4758403920941878, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15562283024416393, "kl": 0.03094482421875, "learning_rate": 6.656869313447175e-07, "loss": 0.1107, "num_tokens": 1277002664.0, "reward": 2.427455425262451, "reward_std": 0.5242178440093994, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.8683035969734192, "rewards/format_reward/std": 0.3385384678840637, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15374813973903656, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 927.2879638671875, "completions/mean_terminated_length": 719.7486572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4760534867614938, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1467290754377202, "kl": 0.03033447265625, "learning_rate": 6.653633850187211e-07, "loss": 0.1399, "num_tokens": 1277487673.0, "reward": 2.33203125, "reward_std": 0.5383273959159851, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.8504464030265808, "rewards/format_reward/std": 0.3570319712162018, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.18044531345367432, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 857.8035888671875, "completions/mean_terminated_length": 698.1063232421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.47626658142879974, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12760507720636563, "kl": 0.029449462890625, "learning_rate": 6.650397748479214e-07, "loss": 0.0663, "num_tokens": 1277943169.0, "reward": 2.3621652126312256, "reward_std": 0.4935818314552307, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.8638392686843872, "rewards/format_reward/std": 0.34334254264831543, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1199323832988739, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 851.4754638671875, "completions/mean_terminated_length": 721.160888671875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4764796760961057, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15966362366683684, "kl": 0.031524658203125, "learning_rate": 6.64716101011412e-07, "loss": 0.1179, "num_tokens": 1278393318.0, "reward": 2.3621652126312256, "reward_std": 0.5111830234527588, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16767139732837677, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 896.982177734375, "completions/mean_terminated_length": 755.6290893554688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.47669277076341166, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12489564653119974, "kl": 0.02880859375, "learning_rate": 6.643923636883213e-07, "loss": 0.0223, "num_tokens": 1278859998.0, "reward": 2.45703125, "reward_std": 0.45318177342414856, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1293378323316574, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 931.982177734375, "completions/mean_terminated_length": 739.1622924804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4769058654307176, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13247195221079344, "kl": 0.02728271484375, "learning_rate": 6.640685630578132e-07, "loss": 0.0914, "num_tokens": 1279346102.0, "reward": 2.3325893878936768, "reward_std": 0.5138127207756042, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3480229377746582, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 946.2277221679688, "completions/mean_terminated_length": 749.0684814453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47711896009802357, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13901713735170673, "kl": 0.02899169921875, "learning_rate": 6.63744699299087e-07, "loss": 0.1297, "num_tokens": 1279836572.0, "reward": 2.279017925262451, "reward_std": 0.5506203174591064, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.8571428656578064, "rewards/format_reward/std": 0.3503182828426361, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17999590933322906, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 994.8817138671875, "completions/mean_terminated_length": 772.8729858398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47733205476532947, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14794792537225845, "kl": 0.025238037109375, "learning_rate": 6.634207725913759e-07, "loss": 0.0665, "num_tokens": 1280350503.0, "reward": 2.2879464626312256, "reward_std": 0.4296041429042816, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.315234512090683, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14767222106456757, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 971.8058471679688, "completions/mean_terminated_length": 741.401123046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47754514943263543, "frac_reward_zero_std": 0.0, "grad_norm": 2.7126879225135037, "kl": 0.12786865234375, "learning_rate": 6.630967831139489e-07, "loss": 0.1403, "num_tokens": 1280864944.0, "reward": 2.2310268878936768, "reward_std": 0.5610305666923523, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.8147321343421936, "rewards/format_reward/std": 0.38894903659820557, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18391697108745575, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 799.4799194335938, "completions/mean_terminated_length": 673.7075805664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4777582440999414, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14736298039460333, "kl": 0.033477783203125, "learning_rate": 6.627727310461091e-07, "loss": 0.0927, "num_tokens": 1281282519.0, "reward": 2.4921875, "reward_std": 0.478099524974823, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1253739446401596, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 950.513427734375, "completions/mean_terminated_length": 704.62841796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.47797133876724734, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1254623105078358, "kl": 0.026214599609375, "learning_rate": 6.624486165671948e-07, "loss": 0.0536, "num_tokens": 1281779117.0, "reward": 2.37890625, "reward_std": 0.39184245467185974, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780036240816116, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 967.5491333007812, "completions/mean_terminated_length": 797.2454833984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4781844334345533, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1311891824439268, "kl": 0.028594970703125, "learning_rate": 6.621244398565784e-07, "loss": 0.0573, "num_tokens": 1282287523.0, "reward": 2.4481027126312256, "reward_std": 0.45720866322517395, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11938986927270889, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 875.310302734375, "completions/mean_terminated_length": 683.4155883789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47839752810185926, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1210130054182694, "kl": 0.0299072265625, "learning_rate": 6.61800201093667e-07, "loss": 0.0265, "num_tokens": 1282750878.0, "reward": 2.4185268878936768, "reward_std": 0.4307468831539154, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 855.5714721679688, "completions/mean_terminated_length": 719.1243896484375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4786106227691652, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14916156842532785, "kl": 0.030120849609375, "learning_rate": 6.614759004579019e-07, "loss": 0.0991, "num_tokens": 1283205358.0, "reward": 2.4581475257873535, "reward_std": 0.4494197964668274, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12109260261058807, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 819.6049194335938, "completions/mean_terminated_length": 715.503662109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4788237174364712, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13958415695662107, "kl": 0.031982421875, "learning_rate": 6.611515381287584e-07, "loss": 0.0495, "num_tokens": 1283640861.0, "reward": 2.474330425262451, "reward_std": 0.42102277278900146, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12848198413848877, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 969.71435546875, "completions/mean_terminated_length": 773.4037475585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4790368121037771, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.6560849333039577, "kl": 0.031158447265625, "learning_rate": 6.608271142857467e-07, "loss": 0.0627, "num_tokens": 1284153917.0, "reward": 2.263951063156128, "reward_std": 0.43909579515457153, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387789011001587, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.16219128668308258, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 934.8995971679688, "completions/mean_terminated_length": 745.9921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.47924990677108303, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1441793425567929, "kl": 0.028472900390625, "learning_rate": 6.605026291084103e-07, "loss": 0.0917, "num_tokens": 1284651456.0, "reward": 2.4464287757873535, "reward_std": 0.43612557649612427, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1101.540283203125, "completions/mean_terminated_length": 860.2857055664062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.479463001438389, "frac_reward_zero_std": 0.0, "grad_norm": 0.1268967009958678, "kl": 0.024505615234375, "learning_rate": 6.601780827763268e-07, "loss": 0.0677, "num_tokens": 1285216850.0, "reward": 2.3370537757873535, "reward_std": 0.5114197731018066, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.1634649783372879, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 959.3683471679688, "completions/mean_terminated_length": 800.6675415039062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.47967609610569495, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1359626959900769, "kl": 0.025665283203125, "learning_rate": 6.598534754691082e-07, "loss": 0.0691, "num_tokens": 1285728231.0, "reward": 2.3270089626312256, "reward_std": 0.45948970317840576, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15788237750530243, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 892.4866333007812, "completions/mean_terminated_length": 685.7105712890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4798891907730009, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14413447586242895, "kl": 0.031036376953125, "learning_rate": 6.595288073663992e-07, "loss": 0.0982, "num_tokens": 1286192369.0, "reward": 2.4620537757873535, "reward_std": 0.442236989736557, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 896.0848388671875, "completions/mean_terminated_length": 693.5170288085938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.48010228544030686, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13026983204389048, "kl": 0.029022216796875, "learning_rate": 6.592040786478794e-07, "loss": 0.051, "num_tokens": 1286660279.0, "reward": 2.3560268878936768, "reward_std": 0.4239169657230377, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3310886323451996, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.151918426156044, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1056.15625, "completions/mean_terminated_length": 840.5380859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4803153801076128, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12667379774917495, "kl": 0.024383544921875, "learning_rate": 6.588792894932605e-07, "loss": 0.0708, "num_tokens": 1287205261.0, "reward": 2.3526787757873535, "reward_std": 0.48564672470092773, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17041954398155212, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 883.3973388671875, "completions/mean_terminated_length": 706.7609252929688, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.4805284747749188, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1438943400326947, "kl": 0.03204345703125, "learning_rate": 6.585544400822891e-07, "loss": 0.0701, "num_tokens": 1287671823.0, "reward": 2.3309152126312256, "reward_std": 0.37643617391586304, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.151995450258255, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1002.7567138671875, "completions/mean_terminated_length": 785.8193969726562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4807415694422247, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13384297494057554, "kl": 0.026580810546875, "learning_rate": 6.582295305947442e-07, "loss": 0.1164, "num_tokens": 1288191362.0, "reward": 2.435826063156128, "reward_std": 0.4654217064380646, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1590748131275177, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 856.7857666015625, "completions/mean_terminated_length": 723.771728515625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.48095466410953064, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15303098879110114, "kl": 0.033416748046875, "learning_rate": 6.579045612104384e-07, "loss": 0.1397, "num_tokens": 1288646898.0, "reward": 2.415736675262451, "reward_std": 0.48180699348449707, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.1983288675546646, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 986.3058471679688, "completions/mean_terminated_length": 755.5027465820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4811677587768366, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13559269148845762, "kl": 0.0257568359375, "learning_rate": 6.575795321092173e-07, "loss": 0.1157, "num_tokens": 1289157803.0, "reward": 2.5189733505249023, "reward_std": 0.41055232286453247, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399910926818848, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 882.5469360351562, "completions/mean_terminated_length": 722.814697265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48138085344414255, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14875046399111191, "kl": 0.030609130859375, "learning_rate": 6.572544434709597e-07, "loss": 0.064, "num_tokens": 1289619376.0, "reward": 2.5011162757873535, "reward_std": 0.40965041518211365, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.49251341819763184, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1236090287566185, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 923.3660888671875, "completions/mean_terminated_length": 715.1005249023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4815939481114485, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1331591315388688, "kl": 0.029998779296875, "learning_rate": 6.569292954755773e-07, "loss": 0.0709, "num_tokens": 1290097284.0, "reward": 2.416294813156128, "reward_std": 0.431540310382843, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1631094515323639, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 960.6295166015625, "completions/mean_terminated_length": 782.6961059570312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.48180704277875447, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14204432141140563, "kl": 0.027252197265625, "learning_rate": 6.566040883030146e-07, "loss": 0.1012, "num_tokens": 1290598254.0, "reward": 2.388392925262451, "reward_std": 0.39278843998908997, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14767220616340637, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 968.8125610351562, "completions/mean_terminated_length": 723.4082641601562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4820201374460604, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14223632467210331, "kl": 0.030517578125, "learning_rate": 6.562788221332488e-07, "loss": 0.1363, "num_tokens": 1291103626.0, "reward": 2.4659600257873535, "reward_std": 0.4136297404766083, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.16194480657577515, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 868.3482666015625, "completions/mean_terminated_length": 730.0848388671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.4822332321133664, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1250078107907124, "kl": 0.02972412109375, "learning_rate": 6.559534971462901e-07, "loss": 0.1066, "num_tokens": 1291559462.0, "reward": 2.3995537757873535, "reward_std": 0.32701024413108826, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 945.6160888671875, "completions/mean_terminated_length": 807.1256103515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48244632678067234, "frac_reward_zero_std": 0.0, "grad_norm": 0.13173478626303223, "kl": 0.028564453125, "learning_rate": 6.556281135221806e-07, "loss": 0.0344, "num_tokens": 1292047194.0, "reward": 2.421875, "reward_std": 0.456134170293808, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15220166742801666, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 889.4085083007812, "completions/mean_terminated_length": 727.2646484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48265942144797824, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15084527769165776, "kl": 0.031585693359375, "learning_rate": 6.553026714409954e-07, "loss": 0.1309, "num_tokens": 1292514417.0, "reward": 2.4877233505249023, "reward_std": 0.4870128035545349, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17070867121219635, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 872.9263916015625, "completions/mean_terminated_length": 725.3040161132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4828725161152842, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15039152057329863, "kl": 0.03240966796875, "learning_rate": 6.549771710828418e-07, "loss": 0.0873, "num_tokens": 1292976240.0, "reward": 2.5078125, "reward_std": 0.4377419054508209, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13607001304626465, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 934.4397583007812, "completions/mean_terminated_length": 788.2146606445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48308561078259016, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1460816354359092, "kl": 0.027740478515625, "learning_rate": 6.546516126278594e-07, "loss": 0.0892, "num_tokens": 1293462037.0, "reward": 2.3074777126312256, "reward_std": 0.3869589567184448, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 860.8281860351562, "completions/mean_terminated_length": 724.9825439453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4832987054498961, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1325369125378957, "kl": 0.02899169921875, "learning_rate": 6.543259962562196e-07, "loss": 0.0666, "num_tokens": 1293912584.0, "reward": 2.435826063156128, "reward_std": 0.3635029196739197, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.126290425658226, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1024.3170166015625, "completions/mean_terminated_length": 805.1544799804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4835118001172021, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13027425240007753, "kl": 0.02423095703125, "learning_rate": 6.540003221481267e-07, "loss": 0.1066, "num_tokens": 1294445366.0, "reward": 2.3582589626312256, "reward_std": 0.4220695197582245, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14886364340782166, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 908.9263916015625, "completions/mean_terminated_length": 712.123046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.48372489478450803, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13369689805636678, "kl": 0.02984619140625, "learning_rate": 6.536745904838158e-07, "loss": 0.0618, "num_tokens": 1294923365.0, "reward": 2.3973214626312256, "reward_std": 0.4122970402240753, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14140157401561737, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 886.2991333007812, "completions/mean_terminated_length": 706.6546020507812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.483937989451814, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.19779333977512292, "kl": 0.033447265625, "learning_rate": 6.533488014435547e-07, "loss": 0.0849, "num_tokens": 1295385067.0, "reward": 2.493861675262451, "reward_std": 0.41481515765190125, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14149756729602814, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 847.2835083007812, "completions/mean_terminated_length": 709.8880615234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48415108411911995, "frac_reward_zero_std": 0.25, "grad_norm": 0.13033102980848388, "kl": 0.03125, "learning_rate": 6.530229552076428e-07, "loss": 0.1271, "num_tokens": 1295830746.0, "reward": 2.4849331378936768, "reward_std": 0.3559582829475403, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.13252247869968414, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1057.149658203125, "completions/mean_terminated_length": 835.1557006835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48436417878642585, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1312186455198644, "kl": 0.026397705078125, "learning_rate": 6.526970519564109e-07, "loss": 0.0979, "num_tokens": 1296374317.0, "reward": 2.3404018878936768, "reward_std": 0.48946699500083923, "rewards/accuracy_reward/mean": 0.4722222089767456, "rewards/accuracy_reward/std": 0.49980661273002625, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.161169171333313, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 902.6138916015625, "completions/mean_terminated_length": 722.074951171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4845772734537318, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.19680341486418076, "kl": 0.0323486328125, "learning_rate": 6.523710918702215e-07, "loss": 0.1105, "num_tokens": 1296853168.0, "reward": 2.3777902126312256, "reward_std": 0.4936728775501251, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14052370190620422, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1042.13623046875, "completions/mean_terminated_length": 760.4942626953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48479036812103776, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14608437731654025, "kl": 0.024688720703125, "learning_rate": 6.520450751294685e-07, "loss": 0.1392, "num_tokens": 1297396829.0, "reward": 2.302455425262451, "reward_std": 0.5085361003875732, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17337898910045624, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 821.185302734375, "completions/mean_terminated_length": 670.5238037109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4850034627883437, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14799577977550182, "kl": 0.03216552734375, "learning_rate": 6.517190019145773e-07, "loss": 0.0952, "num_tokens": 1297835984.0, "reward": 2.3487725257873535, "reward_std": 0.4308519959449768, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15514148771762848, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 800.7969360351562, "completions/mean_terminated_length": 647.631591796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.4852165574556497, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14685880925108288, "kl": 0.0330810546875, "learning_rate": 6.513928724060046e-07, "loss": 0.0677, "num_tokens": 1298262997.0, "reward": 2.4871652126312256, "reward_std": 0.41233643889427185, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16169020533561707, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 859.8772583007812, "completions/mean_terminated_length": 727.2084350585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48542965212295563, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13130114987824393, "kl": 0.030181884765625, "learning_rate": 6.510666867842378e-07, "loss": 0.102, "num_tokens": 1298717022.0, "reward": 2.4910714626312256, "reward_std": 0.4890372157096863, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.13043475151062012, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 921.4420166015625, "completions/mean_terminated_length": 757.2122802734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4856427467902616, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12233424875395836, "kl": 0.028045654296875, "learning_rate": 6.50740445229796e-07, "loss": 0.0814, "num_tokens": 1299202340.0, "reward": 2.470982313156128, "reward_std": 0.43373480439186096, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.09686583280563354, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 961.2344360351562, "completions/mean_terminated_length": 805.9821166992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48585584145756755, "frac_reward_zero_std": 0.0, "grad_norm": 0.13977673098951304, "kl": 0.0272216796875, "learning_rate": 6.504141479232287e-07, "loss": 0.0755, "num_tokens": 1299704349.0, "reward": 2.415736675262451, "reward_std": 0.5184842348098755, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14720545709133148, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 911.2813110351562, "completions/mean_terminated_length": 742.2307739257812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48606893612487345, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13926447540570908, "kl": 0.03070068359375, "learning_rate": 6.500877950451167e-07, "loss": 0.0714, "num_tokens": 1300178395.0, "reward": 2.3794643878936768, "reward_std": 0.42163488268852234, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995505809783936, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14040927588939667, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 867.8170166015625, "completions/mean_terminated_length": 695.7698364257812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4862820307921794, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13024994210825913, "kl": 0.03045654296875, "learning_rate": 6.497613867760711e-07, "loss": 0.0084, "num_tokens": 1300635369.0, "reward": 2.4107143878936768, "reward_std": 0.4235752820968628, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12672585248947144, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 955.3772583007812, "completions/mean_terminated_length": 721.455322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48649512545948537, "frac_reward_zero_std": 0.0, "grad_norm": 0.15891920959067704, "kl": 0.032745361328125, "learning_rate": 6.494349232967341e-07, "loss": 0.098, "num_tokens": 1301134674.0, "reward": 2.3387277126312256, "reward_std": 0.5449174046516418, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390644073486, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.17089413106441498, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 896.4375610351562, "completions/mean_terminated_length": 672.2666625976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4867082201267913, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.154094563296589, "kl": 0.0306396484375, "learning_rate": 6.491084047877781e-07, "loss": 0.1142, "num_tokens": 1301608598.0, "reward": 2.4129464626312256, "reward_std": 0.4288695752620697, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 890.4219360351562, "completions/mean_terminated_length": 721.6700439453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4869213147940973, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14596596261185982, "kl": 0.031707763671875, "learning_rate": 6.487818314299062e-07, "loss": 0.0928, "num_tokens": 1302072915.0, "reward": 2.388951063156128, "reward_std": 0.474895715713501, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1448889821767807, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1012.6027221679688, "completions/mean_terminated_length": 780.62841796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48713440946140324, "frac_reward_zero_std": 0.0, "grad_norm": 0.1552924602429743, "kl": 0.02838134765625, "learning_rate": 6.484552034038515e-07, "loss": 0.1305, "num_tokens": 1302600577.0, "reward": 2.3097100257873535, "reward_std": 0.5640348196029663, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17044061422348022, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 900.8504638671875, "completions/mean_terminated_length": 723.4561767578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4873475041287092, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1473554784091451, "kl": 0.029571533203125, "learning_rate": 6.481285208903781e-07, "loss": 0.0817, "num_tokens": 1303072190.0, "reward": 2.4213171005249023, "reward_std": 0.40599721670150757, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13927440345287323, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 901.6094360351562, "completions/mean_terminated_length": 751.0732421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48756059879601515, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13925707693131345, "kl": 0.02801513671875, "learning_rate": 6.478017840702793e-07, "loss": 0.0772, "num_tokens": 1303547167.0, "reward": 2.4838171005249023, "reward_std": 0.459294855594635, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13359208405017853, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 948.21435546875, "completions/mean_terminated_length": 768.2493286132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48777369346332106, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12104823943059272, "kl": 0.0294189453125, "learning_rate": 6.474749931243791e-07, "loss": 0.0461, "num_tokens": 1304044303.0, "reward": 2.4302456378936768, "reward_std": 0.4273103177547455, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 825.0335083007812, "completions/mean_terminated_length": 660.9392700195312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.487986788130627, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14631506269878528, "kl": 0.032806396484375, "learning_rate": 6.471481482335315e-07, "loss": 0.0983, "num_tokens": 1304481998.0, "reward": 2.501674175262451, "reward_std": 0.4627715051174164, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1064.107177734375, "completions/mean_terminated_length": 833.718994140625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.48819988279793297, "frac_reward_zero_std": 0.0, "grad_norm": 0.13191893932490797, "kl": 0.025848388671875, "learning_rate": 6.468212495786196e-07, "loss": 0.1085, "num_tokens": 1305027390.0, "reward": 2.3309152126312256, "reward_std": 0.5329724550247192, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17015472054481506, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 945.8906860351562, "completions/mean_terminated_length": 741.7962646484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48841297746523893, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14282174751279508, "kl": 0.02899169921875, "learning_rate": 6.464942973405573e-07, "loss": 0.1024, "num_tokens": 1305519341.0, "reward": 2.4174108505249023, "reward_std": 0.477356493473053, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855977296829224, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15415765345096588, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 914.9933471679688, "completions/mean_terminated_length": 733.0077514648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4886260721325449, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1352025359548213, "kl": 0.02703857421875, "learning_rate": 6.461672917002873e-07, "loss": 0.0956, "num_tokens": 1305994138.0, "reward": 2.515625, "reward_std": 0.4342189133167267, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10688953846693039, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 971.7678833007812, "completions/mean_terminated_length": 789.1174926757812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48883916679985084, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1448599229780544, "kl": 0.0264892578125, "learning_rate": 6.458402328387826e-07, "loss": 0.0982, "num_tokens": 1306502194.0, "reward": 2.3113839626312256, "reward_std": 0.4006907045841217, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12975822389125824, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 971.2813110351562, "completions/mean_terminated_length": 751.3064575195312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4890522614671568, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1124431648248687, "kl": 0.026092529296875, "learning_rate": 6.455131209370447e-07, "loss": 0.0613, "num_tokens": 1307005232.0, "reward": 2.5072546005249023, "reward_std": 0.3755997121334076, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10500273108482361, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 920.185302734375, "completions/mean_terminated_length": 755.7723999023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48926535613446276, "frac_reward_zero_std": 0.25, "grad_norm": 0.12673188322453885, "kl": 0.02935791015625, "learning_rate": 6.451859561761054e-07, "loss": 0.0314, "num_tokens": 1307484531.0, "reward": 2.4888393878936768, "reward_std": 0.37483373284339905, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13731664419174194, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 968.794677734375, "completions/mean_terminated_length": 751.7962646484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.48947845080176866, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1263471003296918, "kl": 0.02716064453125, "learning_rate": 6.448587387370249e-07, "loss": 0.0986, "num_tokens": 1307987831.0, "reward": 2.466517925262451, "reward_std": 0.45208361744880676, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 922.1116333007812, "completions/mean_terminated_length": 734.4635620117188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4896915454690746, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 31.967076941628456, "kl": 1.966094970703125, "learning_rate": 6.445314688008937e-07, "loss": 0.1528, "num_tokens": 1308476441.0, "reward": 2.5078125, "reward_std": 0.4202045202255249, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 908.279052734375, "completions/mean_terminated_length": 752.0736083984375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4899046401363806, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1648802877687333, "kl": 0.033477783203125, "learning_rate": 6.442041465488301e-07, "loss": 0.1023, "num_tokens": 1308953478.0, "reward": 2.474330425262451, "reward_std": 0.42755141854286194, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09310024231672287, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1010.341552734375, "completions/mean_terminated_length": 798.3468017578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49011773480368653, "frac_reward_zero_std": 0.25, "grad_norm": 0.10344181008345053, "kl": 0.026031494140625, "learning_rate": 6.438767721619824e-07, "loss": 0.0688, "num_tokens": 1309476623.0, "reward": 2.5284600257873535, "reward_std": 0.3568534255027771, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.12170960009098053, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 941.357177734375, "completions/mean_terminated_length": 756.9166870117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4903308294709925, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13165597797358203, "kl": 0.029388427734375, "learning_rate": 6.435493458215266e-07, "loss": 0.04, "num_tokens": 1309970783.0, "reward": 2.4720983505249023, "reward_std": 0.3915558159351349, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266101151704788, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 954.4866333007812, "completions/mean_terminated_length": 748.54638671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49054392413829845, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1352858919809436, "kl": 0.03277587890625, "learning_rate": 6.432218677086686e-07, "loss": 0.0793, "num_tokens": 1310461657.0, "reward": 2.396763563156128, "reward_std": 0.41674861311912537, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13463464379310608, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 956.9308471679688, "completions/mean_terminated_length": 784.9534912109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4907570188056044, "frac_reward_zero_std": 0.0, "grad_norm": 0.14683353277306666, "kl": 0.029388427734375, "learning_rate": 6.428943380046423e-07, "loss": 0.0518, "num_tokens": 1310969530.0, "reward": 2.291294813156128, "reward_std": 0.45143747329711914, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1804911345243454, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1023.1942138671875, "completions/mean_terminated_length": 769.1336669921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49097011347291036, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13616806672736026, "kl": 0.025421142578125, "learning_rate": 6.425667568907105e-07, "loss": 0.0815, "num_tokens": 1311499345.0, "reward": 2.3325893878936768, "reward_std": 0.448726087808609, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1500372737646103, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 984.7813110351562, "completions/mean_terminated_length": 764.1132202148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49118320814021627, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12283919084358452, "kl": 0.0263671875, "learning_rate": 6.42239124548164e-07, "loss": 0.066, "num_tokens": 1312010639.0, "reward": 2.236049175262451, "reward_std": 0.35402610898017883, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12245608866214752, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 899.6451416015625, "completions/mean_terminated_length": 683.3766479492188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4913963028075222, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13587041197781646, "kl": 0.031402587890625, "learning_rate": 6.419114411583224e-07, "loss": 0.0826, "num_tokens": 1312486080.0, "reward": 2.3755581378936768, "reward_std": 0.415954053401947, "rewards/accuracy_reward/mean": 0.5185185074806213, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813789844513, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 878.8214721679688, "completions/mean_terminated_length": 721.9443359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4916093974748282, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14225775126932688, "kl": 0.028900146484375, "learning_rate": 6.415837069025335e-07, "loss": 0.0645, "num_tokens": 1312954400.0, "reward": 2.392299175262451, "reward_std": 0.46852749586105347, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10481233894824982, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 912.38623046875, "completions/mean_terminated_length": 705.6385498046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49182249214213414, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15059949397848466, "kl": 0.034088134765625, "learning_rate": 6.412559219621728e-07, "loss": 0.0879, "num_tokens": 1313434541.0, "reward": 2.3878350257873535, "reward_std": 0.4745412766933441, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16570167243480682, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 924.5625610351562, "completions/mean_terminated_length": 789.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4920355868094401, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12203006358251214, "kl": 0.027801513671875, "learning_rate": 6.409280865186444e-07, "loss": 0.1011, "num_tokens": 1313915561.0, "reward": 2.4246652126312256, "reward_std": 0.5094877481460571, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1328611820936203, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1003.7344360351562, "completions/mean_terminated_length": 762.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.49224868147674605, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10814766218822099, "kl": 0.024810791015625, "learning_rate": 6.406002007533799e-07, "loss": 0.0687, "num_tokens": 1314436162.0, "reward": 2.439732313156128, "reward_std": 0.43360665440559387, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12782442569732666, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1040.3504638671875, "completions/mean_terminated_length": 758.2085571289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.492461776144052, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1248546146688111, "kl": 0.025543212890625, "learning_rate": 6.402722648478394e-07, "loss": 0.1076, "num_tokens": 1314971919.0, "reward": 2.2862725257873535, "reward_std": 0.5106185674667358, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16907276213169098, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 932.4866333007812, "completions/mean_terminated_length": 756.6563720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49267487081135797, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1378317299999715, "kl": 0.027618408203125, "learning_rate": 6.3994427898351e-07, "loss": 0.0544, "num_tokens": 1315459993.0, "reward": 2.46484375, "reward_std": 0.4626796245574951, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875497817993164, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 897.294677734375, "completions/mean_terminated_length": 715.9173583984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.49288796547866387, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13567760955839048, "kl": 0.028106689453125, "learning_rate": 6.396162433419068e-07, "loss": 0.0805, "num_tokens": 1315933133.0, "reward": 2.463169813156128, "reward_std": 0.3479953706264496, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10171286761760712, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 809.825927734375, "completions/mean_terminated_length": 685.0958251953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4931010601459698, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14932765027112538, "kl": 0.0360107421875, "learning_rate": 6.392881581045722e-07, "loss": 0.1164, "num_tokens": 1316361119.0, "reward": 2.6160714626312256, "reward_std": 0.46530845761299133, "rewards/accuracy_reward/mean": 0.7276785969734192, "rewards/accuracy_reward/std": 0.4456520974636078, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15546400845050812, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1097.2098388671875, "completions/mean_terminated_length": 809.7615966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4933141548132758, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12658388234149398, "kl": 0.0245361328125, "learning_rate": 6.389600234530767e-07, "loss": 0.0777, "num_tokens": 1316928925.0, "reward": 2.2650671005249023, "reward_std": 0.4566171169281006, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17044061422348022, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 871.9174194335938, "completions/mean_terminated_length": 720.833740234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49352724948058174, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1392635750197061, "kl": 0.030517578125, "learning_rate": 6.386318395690178e-07, "loss": 0.1008, "num_tokens": 1317386856.0, "reward": 2.4598214626312256, "reward_std": 0.4671509265899658, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15456202626228333, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 939.8594360351562, "completions/mean_terminated_length": 744.989501953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4937403441478877, "frac_reward_zero_std": 0.0, "grad_norm": 0.13604923951682954, "kl": 0.025909423828125, "learning_rate": 6.383036066340196e-07, "loss": 0.102, "num_tokens": 1317867241.0, "reward": 2.4637277126312256, "reward_std": 0.47998321056365967, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14177079498767853, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 981.9464721679688, "completions/mean_terminated_length": 767.592529296875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.49395343881519366, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11954228812420696, "kl": 0.023834228515625, "learning_rate": 6.379753248297341e-07, "loss": 0.1069, "num_tokens": 1318381681.0, "reward": 2.3716518878936768, "reward_std": 0.4848964810371399, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.15348808467388153, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 813.8013916015625, "completions/mean_terminated_length": 641.0763549804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4941665334824996, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1456615441567663, "kl": 0.03265380859375, "learning_rate": 6.376469943378405e-07, "loss": 0.0554, "num_tokens": 1318815352.0, "reward": 2.4408483505249023, "reward_std": 0.4136618971824646, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 931.2188110351562, "completions/mean_terminated_length": 713.8186645507812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4943796281498056, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14327714887387763, "kl": 0.027862548828125, "learning_rate": 6.373186153400441e-07, "loss": 0.1063, "num_tokens": 1319307274.0, "reward": 2.322544813156128, "reward_std": 0.41019299626350403, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15822990238666534, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1009.1585083007812, "completions/mean_terminated_length": 806.9306640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4945927228171115, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11449580660970503, "kl": 0.0244140625, "learning_rate": 6.36990188018078e-07, "loss": 0.0088, "num_tokens": 1319828593.0, "reward": 2.2544643878936768, "reward_std": 0.41315004229545593, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14619384706020355, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 987.7232666015625, "completions/mean_terminated_length": 814.223388671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49480581748441743, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12978094996689116, "kl": 0.028564453125, "learning_rate": 6.366617125537013e-07, "loss": 0.0457, "num_tokens": 1320343989.0, "reward": 2.3978796005249023, "reward_std": 0.45256125926971436, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1420171558856964, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 917.4285888671875, "completions/mean_terminated_length": 729.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4950189121517234, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13803362656728765, "kl": 0.0286865234375, "learning_rate": 6.363331891287002e-07, "loss": 0.0524, "num_tokens": 1320820373.0, "reward": 2.4056921005249023, "reward_std": 0.48847725987434387, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.17171035706996918, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 951.9754638671875, "completions/mean_terminated_length": 745.5623168945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49523200681902935, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12980912994963212, "kl": 0.027252197265625, "learning_rate": 6.360046179248868e-07, "loss": 0.0789, "num_tokens": 1321311018.0, "reward": 2.3939733505249023, "reward_std": 0.4923938512802124, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17634892463684082, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 999.3750610351562, "completions/mean_terminated_length": 750.254150390625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4954451014863353, "frac_reward_zero_std": 0.0, "grad_norm": 0.13489212029771006, "kl": 0.02777099609375, "learning_rate": 6.356759991241008e-07, "loss": 0.0452, "num_tokens": 1321831906.0, "reward": 2.349888563156128, "reward_std": 0.5136919021606445, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 978.8170166015625, "completions/mean_terminated_length": 810.2894287109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.49565819615364126, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13341273547399218, "kl": 0.028564453125, "learning_rate": 6.353473329082072e-07, "loss": 0.0555, "num_tokens": 1322332976.0, "reward": 2.4029018878936768, "reward_std": 0.4221830666065216, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1569942682981491, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 918.5402221679688, "completions/mean_terminated_length": 753.887451171875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4958712908209472, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11686818221012897, "kl": 0.029022216796875, "learning_rate": 6.350186194590974e-07, "loss": 0.0789, "num_tokens": 1322809298.0, "reward": 2.41015625, "reward_std": 0.408230185508728, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14936910569667816, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1084.9241943359375, "completions/mean_terminated_length": 829.1920776367188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4960843854882532, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1188251701991562, "kl": 0.024169921875, "learning_rate": 6.346898589586897e-07, "loss": 0.0768, "num_tokens": 1323368512.0, "reward": 2.3175225257873535, "reward_std": 0.40248122811317444, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813789844513, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 831.9933471679688, "completions/mean_terminated_length": 679.2286376953125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.49629748015555913, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1572690744195642, "kl": 0.03399658203125, "learning_rate": 6.34361051588927e-07, "loss": 0.0795, "num_tokens": 1323813069.0, "reward": 2.321986675262451, "reward_std": 0.42697685956954956, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13700605928897858, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 935.9732666015625, "completions/mean_terminated_length": 733.5198364257812, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.49651057482286504, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.148269396194928, "kl": 0.02978515625, "learning_rate": 6.3403219753178e-07, "loss": 0.0906, "num_tokens": 1324304689.0, "reward": 2.390625, "reward_std": 0.47408512234687805, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 951.01123046875, "completions/mean_terminated_length": 730.43701171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.496723669490171, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13564127958370173, "kl": 0.026824951171875, "learning_rate": 6.337032969692436e-07, "loss": 0.0765, "num_tokens": 1324800774.0, "reward": 2.3722100257873535, "reward_std": 0.44210124015808105, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15578390657901764, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 953.7835083007812, "completions/mean_terminated_length": 754.5725708007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.49693676415747695, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.25274424419260066, "kl": 0.03350830078125, "learning_rate": 6.333743500833392e-07, "loss": 0.0719, "num_tokens": 1325302645.0, "reward": 2.376674175262451, "reward_std": 0.37470710277557373, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12359261512756348, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 879.247802734375, "completions/mean_terminated_length": 764.6642456054688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4971498588247829, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.137855395878548, "kl": 0.030426025390625, "learning_rate": 6.330453570561138e-07, "loss": 0.044, "num_tokens": 1325765556.0, "reward": 2.463169813156128, "reward_std": 0.41422560811042786, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11752051115036011, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1048.6875, "completions/mean_terminated_length": 790.438232421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49736295349208887, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13968529825377668, "kl": 0.024658203125, "learning_rate": 6.327163180696401e-07, "loss": 0.0946, "num_tokens": 1326309016.0, "reward": 2.3270089626312256, "reward_std": 0.4345180094242096, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13865114748477936, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 929.6317138671875, "completions/mean_terminated_length": 726.0238037109375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.4975760481593948, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1552874175575847, "kl": 0.031646728515625, "learning_rate": 6.323872333060154e-07, "loss": 0.0401, "num_tokens": 1326798563.0, "reward": 2.353236675262451, "reward_std": 0.4689759612083435, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14050592482089996, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 893.5513916015625, "completions/mean_terminated_length": 711.583984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4977891428267008, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.21169147350488637, "kl": 0.035552978515625, "learning_rate": 6.320581029473636e-07, "loss": 0.0795, "num_tokens": 1327278026.0, "reward": 2.4302456378936768, "reward_std": 0.4072837233543396, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962118953466415, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 967.4866333007812, "completions/mean_terminated_length": 793.9326171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49800223749400674, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13081704392667673, "kl": 0.027801513671875, "learning_rate": 6.317289271758325e-07, "loss": 0.0858, "num_tokens": 1327785748.0, "reward": 2.3833706378936768, "reward_std": 0.4488319456577301, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661914110183716, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 930.1964721679688, "completions/mean_terminated_length": 760.6580810546875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.49821533216131264, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1377822455684268, "kl": 0.02850341796875, "learning_rate": 6.313997061735963e-07, "loss": 0.0522, "num_tokens": 1328274940.0, "reward": 2.392857313156128, "reward_std": 0.4331909418106079, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10649171471595764, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 918.3438110351562, "completions/mean_terminated_length": 705.5968017578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4984284268286186, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14210385122744254, "kl": 0.02813720703125, "learning_rate": 6.310704401228532e-07, "loss": 0.1131, "num_tokens": 1328757014.0, "reward": 2.427455425262451, "reward_std": 0.40171927213668823, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12848198413848877, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1033.1160888671875, "completions/mean_terminated_length": 812.4891357421875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.49864152149592456, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12023240623064611, "kl": 0.023101806640625, "learning_rate": 6.30741129205827e-07, "loss": 0.0497, "num_tokens": 1329292602.0, "reward": 2.3175225257873535, "reward_std": 0.39013761281967163, "rewards/accuracy_reward/mean": 0.42592594027519226, "rewards/accuracy_reward/std": 0.4950558841228485, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1506175547838211, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 975.044677734375, "completions/mean_terminated_length": 723.8016357421875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4988546161632305, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13396653816847484, "kl": 0.02801513671875, "learning_rate": 6.304117736047659e-07, "loss": 0.0276, "num_tokens": 1329803662.0, "reward": 2.3705358505249023, "reward_std": 0.40252885222435, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 998.52685546875, "completions/mean_terminated_length": 790.8770141601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49906771083053647, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11945604638024607, "kl": 0.026702880859375, "learning_rate": 6.300823735019432e-07, "loss": 0.0643, "num_tokens": 1330320218.0, "reward": 2.34765625, "reward_std": 0.4534340798854828, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1706821471452713, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 989.6875610351562, "completions/mean_terminated_length": 770.0377197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49928080549784243, "frac_reward_zero_std": 0.0, "grad_norm": 0.13630784708423205, "kl": 0.02728271484375, "learning_rate": 6.297529290796565e-07, "loss": 0.1159, "num_tokens": 1330836766.0, "reward": 2.2700893878936768, "reward_std": 0.47162166237831116, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387789011001587, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1423168033361435, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 823.0402221679688, "completions/mean_terminated_length": 696.3201904296875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4994939001651484, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14460942127160265, "kl": 0.03515625, "learning_rate": 6.294234405202281e-07, "loss": 0.0283, "num_tokens": 1331271696.0, "reward": 2.4542412757873535, "reward_std": 0.47995179891586304, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18162193894386292, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 761.1295166015625, "completions/mean_terminated_length": 665.4628295898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.49970699483245434, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15260264162445966, "kl": 0.03515625, "learning_rate": 6.290939080060047e-07, "loss": 0.0809, "num_tokens": 1331675578.0, "reward": 2.5027902126312256, "reward_std": 0.47969430685043335, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.13839317858219147, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1068.529052734375, "completions/mean_terminated_length": 822.2932739257812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.49992008949976025, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12303925519756481, "kl": 0.02471923828125, "learning_rate": 6.287643317193575e-07, "loss": 0.0618, "num_tokens": 1332226167.0, "reward": 2.196986675262451, "reward_std": 0.4591960906982422, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067808747291565, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.1634487360715866, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 959.2879638671875, "completions/mean_terminated_length": 757.674560546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5001331841670662, "frac_reward_zero_std": 0.0, "grad_norm": 0.1302247454302692, "kl": 0.028167724609375, "learning_rate": 6.284347118426813e-07, "loss": 0.096, "num_tokens": 1332729464.0, "reward": 2.302455425262451, "reward_std": 0.49819159507751465, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3480229377746582, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.2105279266834259, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 987.310302734375, "completions/mean_terminated_length": 826.4344482421875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5003462788343722, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11648255671874307, "kl": 0.027435302734375, "learning_rate": 6.28105048558396e-07, "loss": 0.0277, "num_tokens": 1333237555.0, "reward": 2.306361675262451, "reward_std": 0.45896437764167786, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12691166996955872, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1088.7679443359375, "completions/mean_terminated_length": 844.2577514648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5005593735016781, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12653991344412277, "kl": 0.023590087890625, "learning_rate": 6.277753420489447e-07, "loss": 0.0603, "num_tokens": 1333795339.0, "reward": 2.2645089626312256, "reward_std": 0.4586235582828522, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 854.919677734375, "completions/mean_terminated_length": 718.3980102539062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.500772468168984, "frac_reward_zero_std": 0.0, "grad_norm": 0.14901950347911191, "kl": 0.03173828125, "learning_rate": 6.274455924967946e-07, "loss": 0.0713, "num_tokens": 1334247527.0, "reward": 2.53125, "reward_std": 0.4887952208518982, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 999.841552734375, "completions/mean_terminated_length": 805.7380981445312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.50098556283629, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1216090166935823, "kl": 0.02642822265625, "learning_rate": 6.271158000844374e-07, "loss": 0.0745, "num_tokens": 1334772848.0, "reward": 2.3956475257873535, "reward_std": 0.4995259642601013, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17260229587554932, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 908.4754638671875, "completions/mean_terminated_length": 728.8604736328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5011986575035959, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12732142572870545, "kl": 0.02923583984375, "learning_rate": 6.267859649943872e-07, "loss": 0.0641, "num_tokens": 1335247541.0, "reward": 2.4481027126312256, "reward_std": 0.4220970571041107, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 911.6451416015625, "completions/mean_terminated_length": 742.6487426757812, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.501411752170902, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1387441313573059, "kl": 0.028533935546875, "learning_rate": 6.26456087409183e-07, "loss": 0.1202, "num_tokens": 1335724838.0, "reward": 2.4425225257873535, "reward_std": 0.4729068875312805, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13124457001686096, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 988.6629638671875, "completions/mean_terminated_length": 821.6873779296875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5016248468382078, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1262241091810573, "kl": 0.02801513671875, "learning_rate": 6.261261675113866e-07, "loss": 0.0112, "num_tokens": 1336234591.0, "reward": 2.294642925262451, "reward_std": 0.51183021068573, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.8794642686843872, "rewards/format_reward/std": 0.3259509205818176, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1671055108308792, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 934.0848388671875, "completions/mean_terminated_length": 745.0391845703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5018379415055139, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13346042355819976, "kl": 0.028564453125, "learning_rate": 6.257962054835835e-07, "loss": 0.0434, "num_tokens": 1336722805.0, "reward": 2.420201063156128, "reward_std": 0.4596903920173645, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13230563700199127, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1079.15625, "completions/mean_terminated_length": 858.8438720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5020510361728198, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12441041586216225, "kl": 0.024444580078125, "learning_rate": 6.254662015083822e-07, "loss": 0.084, "num_tokens": 1337282459.0, "reward": 2.3683037757873535, "reward_std": 0.4865238070487976, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13840332627296448, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1033.044677734375, "completions/mean_terminated_length": 802.24658203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5022641308401258, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.123684119602041, "kl": 0.025421142578125, "learning_rate": 6.25136155768415e-07, "loss": 0.0923, "num_tokens": 1337815359.0, "reward": 2.4090402126312256, "reward_std": 0.4522753059864044, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1098259910941124, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 937.1473388671875, "completions/mean_terminated_length": 791.2777709960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5024772255074317, "frac_reward_zero_std": 0.0, "grad_norm": 0.16593296882699007, "kl": 0.02984619140625, "learning_rate": 6.248060684463366e-07, "loss": 0.1772, "num_tokens": 1338304129.0, "reward": 2.4760046005249023, "reward_std": 0.5160124897956848, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.17038200795650482, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1074.984375, "completions/mean_terminated_length": 873.0377197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5026903201747376, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11381415365444424, "kl": 0.023406982421875, "learning_rate": 6.244759397248253e-07, "loss": 0.0653, "num_tokens": 1338861402.0, "reward": 2.5234375, "reward_std": 0.4501384198665619, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.216333270072937, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14262787997722626, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1037.3773193359375, "completions/mean_terminated_length": 821.0108642578125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5029034148420436, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.24337235481109948, "kl": 0.0264892578125, "learning_rate": 6.24145769786582e-07, "loss": 0.0592, "num_tokens": 1339403443.0, "reward": 2.30078125, "reward_std": 0.39265257120132446, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.1038430780172348, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 925.5402221679688, "completions/mean_terminated_length": 748.614990234375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5031165095093495, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15258267265923403, "kl": 0.030731201171875, "learning_rate": 6.238155588143306e-07, "loss": 0.1103, "num_tokens": 1339888053.0, "reward": 2.4949777126312256, "reward_std": 0.47983109951019287, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15968577563762665, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1096.1875, "completions/mean_terminated_length": 853.5686645507812, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5033296041766555, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12329117046407867, "kl": 0.023681640625, "learning_rate": 6.234853069908174e-07, "loss": 0.1047, "num_tokens": 1340450473.0, "reward": 2.3521206378936768, "reward_std": 0.5603268146514893, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.18919269740581512, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1001.3013916015625, "completions/mean_terminated_length": 817.2362060546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5035426988439614, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13599712589471538, "kl": 0.0308837890625, "learning_rate": 6.231550144988116e-07, "loss": 0.042, "num_tokens": 1340972128.0, "reward": 2.4425225257873535, "reward_std": 0.43468981981277466, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10892428457736969, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 931.5089721679688, "completions/mean_terminated_length": 778.4873046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5037557935112674, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1423457383477721, "kl": 0.028076171875, "learning_rate": 6.22824681521105e-07, "loss": 0.063, "num_tokens": 1341454644.0, "reward": 2.446986675262451, "reward_std": 0.43831828236579895, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1477556824684143, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1103.044677734375, "completions/mean_terminated_length": 795.5148315429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5039688881785733, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13752616984853727, "kl": 0.025115966796875, "learning_rate": 6.224943082405112e-07, "loss": 0.0759, "num_tokens": 1342025288.0, "reward": 2.1316964626312256, "reward_std": 0.3737923800945282, "rewards/accuracy_reward/mean": 0.2433035671710968, "rewards/accuracy_reward/std": 0.42955654859542847, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1605832874774933, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1047.94873046875, "completions/mean_terminated_length": 806.9390258789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5041819828458792, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13662966443401997, "kl": 0.02508544921875, "learning_rate": 6.221638948398668e-07, "loss": 0.0395, "num_tokens": 1342565329.0, "reward": 2.345424175262451, "reward_std": 0.3844912648200989, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11214316636323929, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 854.0267944335938, "completions/mean_terminated_length": 686.9312744140625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5043950775131852, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14258892542209364, "kl": 0.03070068359375, "learning_rate": 6.218334415020303e-07, "loss": 0.0649, "num_tokens": 1343016685.0, "reward": 2.4771206378936768, "reward_std": 0.4083729684352875, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1038.0625, "completions/mean_terminated_length": 815.1607666015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5046081721804911, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13867943430506147, "kl": 0.02685546875, "learning_rate": 6.215029484098823e-07, "loss": 0.0755, "num_tokens": 1343545321.0, "reward": 2.3208706378936768, "reward_std": 0.4246404767036438, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.16107910871505737, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 913.7545166015625, "completions/mean_terminated_length": 748.4041137695312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5048212668477972, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13797999918245624, "kl": 0.03057861328125, "learning_rate": 6.211724157463254e-07, "loss": 0.0728, "num_tokens": 1344024971.0, "reward": 2.5044643878936768, "reward_std": 0.42501553893089294, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12337145209312439, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1000.0335083007812, "completions/mean_terminated_length": 754.6419067382812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5050343615151031, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1396032795565846, "kl": 0.025665283203125, "learning_rate": 6.208418436942842e-07, "loss": 0.1175, "num_tokens": 1344545210.0, "reward": 2.44921875, "reward_std": 0.4854455888271332, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1076.82373046875, "completions/mean_terminated_length": 846.1022338867188, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5052474561824091, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13677886177496132, "kl": 0.025909423828125, "learning_rate": 6.20511232436705e-07, "loss": 0.0472, "num_tokens": 1345099723.0, "reward": 2.380580425262451, "reward_std": 0.4701564908027649, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14457522332668304, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 981.6607666015625, "completions/mean_terminated_length": 780.8381958007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.505460550849715, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1531722602951443, "kl": 0.0318603515625, "learning_rate": 6.20180582156556e-07, "loss": 0.117, "num_tokens": 1345611107.0, "reward": 2.3677456378936768, "reward_std": 0.538916826248169, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.19221055507659912, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 980.1339721679688, "completions/mean_terminated_length": 730.0826416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.505673645517021, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13844762772271924, "kl": 0.0267333984375, "learning_rate": 6.198498930368264e-07, "loss": 0.0562, "num_tokens": 1346127151.0, "reward": 2.420201063156128, "reward_std": 0.4536563456058502, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15941192209720612, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1028.4710693359375, "completions/mean_terminated_length": 764.9971923828125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5058867401843269, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11792195244194115, "kl": 0.0281982421875, "learning_rate": 6.195191652605277e-07, "loss": 0.1015, "num_tokens": 1346653682.0, "reward": 2.369419813156128, "reward_std": 0.4254020154476166, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1569942682981491, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 888.1920166015625, "completions/mean_terminated_length": 752.25439453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5060998348516328, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.17919901234290608, "kl": 0.035888671875, "learning_rate": 6.191883990106922e-07, "loss": 0.0487, "num_tokens": 1347114392.0, "reward": 2.544642925262451, "reward_std": 0.35875949263572693, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.11074429750442505, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 885.6942138671875, "completions/mean_terminated_length": 729.7392578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5063129295189388, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13241347842841886, "kl": 0.03155517578125, "learning_rate": 6.188575944703737e-07, "loss": 0.0906, "num_tokens": 1347584879.0, "reward": 2.4542412757873535, "reward_std": 0.4288511574268341, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15750236809253693, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 927.419677734375, "completions/mean_terminated_length": 760.7692260742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5065260241862447, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12657821327459, "kl": 0.029937744140625, "learning_rate": 6.185267518226472e-07, "loss": 0.0782, "num_tokens": 1348071419.0, "reward": 2.439732313156128, "reward_std": 0.39730075001716614, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12198750674724579, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 894.3370971679688, "completions/mean_terminated_length": 691.4619140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5067391188535507, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1377727527660299, "kl": 0.03436279296875, "learning_rate": 6.181958712506091e-07, "loss": 0.0527, "num_tokens": 1348534162.0, "reward": 2.451451063156128, "reward_std": 0.4207465350627899, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.11946304887533188, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1035.65625, "completions/mean_terminated_length": 815.58154296875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5069522135208566, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12292191967576041, "kl": 0.0257568359375, "learning_rate": 6.178649529373762e-07, "loss": 0.0542, "num_tokens": 1349068536.0, "reward": 2.337611675262451, "reward_std": 0.41124528646469116, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1515430212020874, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1014.47998046875, "completions/mean_terminated_length": 819.8381958007812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5071653081881626, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1357666458468901, "kl": 0.025360107421875, "learning_rate": 6.175339970660862e-07, "loss": 0.0693, "num_tokens": 1349590703.0, "reward": 2.427455425262451, "reward_std": 0.4426553547382355, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15374813973903656, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 835.1473388671875, "completions/mean_terminated_length": 689.60498046875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5073784028554685, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13977011980293846, "kl": 0.031951904296875, "learning_rate": 6.172030038198984e-07, "loss": 0.1366, "num_tokens": 1350025937.0, "reward": 2.53515625, "reward_std": 0.42308393120765686, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15578389167785645, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 954.72998046875, "completions/mean_terminated_length": 792.1410522460938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5075914975227744, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13697717831623799, "kl": 0.028228759765625, "learning_rate": 6.168719733819918e-07, "loss": 0.0753, "num_tokens": 1350520584.0, "reward": 2.318638563156128, "reward_std": 0.5162521600723267, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13259780406951904, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 859.7857666015625, "completions/mean_terminated_length": 720.5187377929688, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5078045921900805, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13573834732359238, "kl": 0.030670166015625, "learning_rate": 6.165409059355666e-07, "loss": 0.0844, "num_tokens": 1350973288.0, "reward": 2.6026787757873535, "reward_std": 0.443278431892395, "rewards/accuracy_reward/mean": 0.7075892686843872, "rewards/accuracy_reward/std": 0.4553784728050232, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1249600425362587, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1007.1785888671875, "completions/mean_terminated_length": 756.3434448242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5080176868573864, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14652890677079772, "kl": 0.02813720703125, "learning_rate": 6.16209801663843e-07, "loss": 0.0985, "num_tokens": 1351503736.0, "reward": 2.2589287757873535, "reward_std": 0.477927029132843, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.2068338543176651, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1012.9844360351562, "completions/mean_terminated_length": 767.0967407226562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5082307815246924, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1325062666358616, "kl": 0.027435302734375, "learning_rate": 6.158786607500624e-07, "loss": 0.077, "num_tokens": 1352022545.0, "reward": 2.3939733505249023, "reward_std": 0.3978215157985687, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14553911983966827, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1146.946533203125, "completions/mean_terminated_length": 871.1137084960938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5084438761919983, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11355119041496733, "kl": 0.0230712890625, "learning_rate": 6.155474833774854e-07, "loss": 0.0825, "num_tokens": 1352618105.0, "reward": 2.3002233505249023, "reward_std": 0.4399286210536957, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1478201001882553, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1025.97998046875, "completions/mean_terminated_length": 783.1795654296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5086569708593043, "frac_reward_zero_std": 0.0, "grad_norm": 0.13476401574469568, "kl": 0.027862548828125, "learning_rate": 6.152162697293939e-07, "loss": 0.0697, "num_tokens": 1353152944.0, "reward": 2.3521206378936768, "reward_std": 0.48104655742645264, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16961827874183655, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 915.5670166015625, "completions/mean_terminated_length": 726.828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5088700655266102, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13599772013034264, "kl": 0.030059814453125, "learning_rate": 6.148850199890888e-07, "loss": 0.0472, "num_tokens": 1353632222.0, "reward": 2.5206475257873535, "reward_std": 0.4059658646583557, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 994.9553833007812, "completions/mean_terminated_length": 741.1744995117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5090831601939162, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13624447988477673, "kl": 0.031158447265625, "learning_rate": 6.145537343398917e-07, "loss": 0.0782, "num_tokens": 1354138970.0, "reward": 2.44140625, "reward_std": 0.4164377450942993, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12992529571056366, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 969.8839721679688, "completions/mean_terminated_length": 687.4478759765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5092962548612221, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15657589692656326, "kl": 0.027435302734375, "learning_rate": 6.142224129651437e-07, "loss": 0.105, "num_tokens": 1354643750.0, "reward": 2.4140625, "reward_std": 0.41708022356033325, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14457522332668304, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 970.4219360351562, "completions/mean_terminated_length": 750.271484375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.509509349528528, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13944631200538407, "kl": 0.02764892578125, "learning_rate": 6.13891056048206e-07, "loss": 0.1195, "num_tokens": 1355150803.0, "reward": 2.3638393878936768, "reward_std": 0.5155957937240601, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17786091566085815, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1011.93310546875, "completions/mean_terminated_length": 806.9358520507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.509722444195834, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12983141815870317, "kl": 0.026641845703125, "learning_rate": 6.135596637724592e-07, "loss": 0.0762, "num_tokens": 1355668341.0, "reward": 2.3621652126312256, "reward_std": 0.4232122600078583, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 896.7344360351562, "completions/mean_terminated_length": 697.8245849609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5099355388631399, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12616642169598924, "kl": 0.0284423828125, "learning_rate": 6.132282363213037e-07, "loss": 0.0519, "num_tokens": 1356132110.0, "reward": 2.4927456378936768, "reward_std": 0.3441968262195587, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518539428711, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1033.97998046875, "completions/mean_terminated_length": 810.1771240234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5101486335304459, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12834055217632015, "kl": 0.028045654296875, "learning_rate": 6.12896773878159e-07, "loss": 0.0901, "num_tokens": 1356667909.0, "reward": 2.3588171005249023, "reward_std": 0.4196842312812805, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18230371177196503, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 903.7723388671875, "completions/mean_terminated_length": 743.638671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5103617281977518, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.133934563703041, "kl": 0.02923583984375, "learning_rate": 6.125652766264644e-07, "loss": 0.0956, "num_tokens": 1357138815.0, "reward": 2.4949777126312256, "reward_std": 0.37598463892936707, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13206008076667786, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1075.7879638671875, "completions/mean_terminated_length": 789.1820678710938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5105748228650578, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1359943249877691, "kl": 0.026336669921875, "learning_rate": 6.122337447496781e-07, "loss": 0.1118, "num_tokens": 1357688656.0, "reward": 2.2488839626312256, "reward_std": 0.43310675024986267, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1709425151348114, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 926.5245971679688, "completions/mean_terminated_length": 722.3509521484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5107879175323637, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13621738131296754, "kl": 0.03009033203125, "learning_rate": 6.119021784312776e-07, "loss": 0.1081, "num_tokens": 1358179595.0, "reward": 2.4737725257873535, "reward_std": 0.4345763623714447, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15496434271335602, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 872.3951416015625, "completions/mean_terminated_length": 704.4515380859375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5110010121996698, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12710492159804265, "kl": 0.02874755859375, "learning_rate": 6.115705778547597e-07, "loss": 0.0557, "num_tokens": 1358640892.0, "reward": 2.3978796005249023, "reward_std": 0.38329121470451355, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14827017486095428, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1062.529052734375, "completions/mean_terminated_length": 831.7713623046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5112141068669757, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1184153102791087, "kl": 0.025970458984375, "learning_rate": 6.112389432036395e-07, "loss": 0.0731, "num_tokens": 1359184265.0, "reward": 2.380580425262451, "reward_std": 0.4913359582424164, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1709425151348114, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 910.0067138671875, "completions/mean_terminated_length": 692.0930786132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5114272015342816, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13303574340662616, "kl": 0.0286865234375, "learning_rate": 6.10907274661452e-07, "loss": 0.0502, "num_tokens": 1359662748.0, "reward": 2.4564733505249023, "reward_std": 0.38364139199256897, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 983.044677734375, "completions/mean_terminated_length": 758.5405883789062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5116402962015876, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1399615806852762, "kl": 0.028350830078125, "learning_rate": 6.105755724117497e-07, "loss": 0.09, "num_tokens": 1360173136.0, "reward": 2.467076063156128, "reward_std": 0.4884949326515198, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.18169322609901428, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1043.5, "completions/mean_terminated_length": 818.4480590820312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5118533908688935, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13474856976283414, "kl": 0.0252685546875, "learning_rate": 6.10243836638105e-07, "loss": 0.0628, "num_tokens": 1360712992.0, "reward": 2.2393975257873535, "reward_std": 0.4417692720890045, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.4822677969932556, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.1887698918581009, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 891.5558471679688, "completions/mean_terminated_length": 756.0125122070312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5120664855361995, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14205893620096882, "kl": 0.0302734375, "learning_rate": 6.099120675241078e-07, "loss": 0.0808, "num_tokens": 1361183561.0, "reward": 2.4955358505249023, "reward_std": 0.5023698210716248, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17367035150527954, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 925.49560546875, "completions/mean_terminated_length": 755.2442016601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5122795802035054, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1517751241020233, "kl": 0.029937744140625, "learning_rate": 6.095802652533673e-07, "loss": 0.0806, "num_tokens": 1361669079.0, "reward": 2.4481027126312256, "reward_std": 0.4166407883167267, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1032.8125, "completions/mean_terminated_length": 863.6146240234375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5124926748708114, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.7832429674798569, "kl": 0.055450439453125, "learning_rate": 6.092484300095104e-07, "loss": 0.0586, "num_tokens": 1362205123.0, "reward": 2.349330425262451, "reward_std": 0.44817838072776794, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14969991147518158, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 985.232177734375, "completions/mean_terminated_length": 757.701904296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5127057695381173, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15157403944599057, "kl": 0.028656005859375, "learning_rate": 6.089165619761825e-07, "loss": 0.1251, "num_tokens": 1362718299.0, "reward": 2.27734375, "reward_std": 0.5197800993919373, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9447544813156128, "rewards/tag_count_reward/std": 0.1910770684480667, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 921.7991333007812, "completions/mean_terminated_length": 786.6549682617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5129188642054232, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14860180174126825, "kl": 0.028839111328125, "learning_rate": 6.085846613370473e-07, "loss": 0.1141, "num_tokens": 1363199425.0, "reward": 2.431919813156128, "reward_std": 0.5232210159301758, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18007564544677734, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 930.58935546875, "completions/mean_terminated_length": 702.3010864257812, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5131319588727292, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1408336181338977, "kl": 0.030487060546875, "learning_rate": 6.082527282757862e-07, "loss": 0.0754, "num_tokens": 1363694777.0, "reward": 2.3208706378936768, "reward_std": 0.41732296347618103, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1706821471452713, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 994.2656860351562, "completions/mean_terminated_length": 825.012939453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5133450535400351, "frac_reward_zero_std": 0.0, "grad_norm": 0.1520453280721132, "kl": 0.030548095703125, "learning_rate": 6.079207629760989e-07, "loss": 0.0806, "num_tokens": 1364204112.0, "reward": 2.361607313156128, "reward_std": 0.4760262966156006, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16487936675548553, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 996.357177734375, "completions/mean_terminated_length": 814.65966796875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5135581482073411, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14086777766932815, "kl": 0.02728271484375, "learning_rate": 6.075887656217029e-07, "loss": 0.0983, "num_tokens": 1364719376.0, "reward": 2.4095983505249023, "reward_std": 0.4826275110244751, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1591111123561859, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1041.247802734375, "completions/mean_terminated_length": 791.6629638671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.513771242874647, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13193189319816837, "kl": 0.02691650390625, "learning_rate": 6.072567363963331e-07, "loss": 0.0692, "num_tokens": 1365256735.0, "reward": 2.349888563156128, "reward_std": 0.445743590593338, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16229133307933807, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 809.6406860351562, "completions/mean_terminated_length": 701.4344482421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.513984337541953, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14569900434973557, "kl": 0.0335693359375, "learning_rate": 6.069246754837424e-07, "loss": 0.0381, "num_tokens": 1365686622.0, "reward": 2.4341518878936768, "reward_std": 0.38321271538734436, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12247265130281448, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1078.6004638671875, "completions/mean_terminated_length": 841.6361083984375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.514197432209259, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11673080216470279, "kl": 0.0252685546875, "learning_rate": 6.065925830677007e-07, "loss": 0.1091, "num_tokens": 1366244747.0, "reward": 2.3214287757873535, "reward_std": 0.5216419100761414, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18981274962425232, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 930.8616333007812, "completions/mean_terminated_length": 748.05712890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.514410526876565, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13250732675582602, "kl": 0.02764892578125, "learning_rate": 6.062604593319964e-07, "loss": 0.0402, "num_tokens": 1366729469.0, "reward": 2.4034600257873535, "reward_std": 0.3924359083175659, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 916.5156860351562, "completions/mean_terminated_length": 754.875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5146236215438709, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12954069060231613, "kl": 0.027984619140625, "learning_rate": 6.059283044604342e-07, "loss": 0.088, "num_tokens": 1367207988.0, "reward": 2.4386162757873535, "reward_std": 0.4345270097255707, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15481625497341156, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 993.7076416015625, "completions/mean_terminated_length": 757.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5148367162111768, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1332248091862224, "kl": 0.028533935546875, "learning_rate": 6.055961186368364e-07, "loss": 0.0312, "num_tokens": 1367725537.0, "reward": 2.3839287757873535, "reward_std": 0.46096891164779663, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13840332627296448, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 927.5379638671875, "completions/mean_terminated_length": 727.0342407226562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5150498108784828, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12682075155253925, "kl": 0.02947998046875, "learning_rate": 6.052639020450424e-07, "loss": 0.0662, "num_tokens": 1368211986.0, "reward": 2.44140625, "reward_std": 0.45993292331695557, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1008.44873046875, "completions/mean_terminated_length": 809.3856201171875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5152629055457887, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14025416724026074, "kl": 0.029052734375, "learning_rate": 6.049316548689087e-07, "loss": 0.0925, "num_tokens": 1368729563.0, "reward": 2.345982313156128, "reward_std": 0.450383722782135, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1826944798231125, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 970.3527221679688, "completions/mean_terminated_length": 746.6900024414062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5154760002130947, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1348113331352353, "kl": 0.027862548828125, "learning_rate": 6.045993772923087e-07, "loss": 0.0928, "num_tokens": 1369232809.0, "reward": 2.3113839626312256, "reward_std": 0.4495818614959717, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.1974812150001526, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 868.5469360351562, "completions/mean_terminated_length": 752.9142456054688, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5156890948804006, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14470634778148325, "kl": 0.0338134765625, "learning_rate": 6.042670694991326e-07, "loss": 0.0867, "num_tokens": 1369686254.0, "reward": 2.537388563156128, "reward_std": 0.4362529516220093, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1212.32373046875, "completions/mean_terminated_length": 930.4388427734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5159021895477066, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11663098067290441, "kl": 0.022735595703125, "learning_rate": 6.039347316732874e-07, "loss": 0.0621, "num_tokens": 1370311103.0, "reward": 2.2315850257873535, "reward_std": 0.4198766052722931, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518688440323, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 959.732177734375, "completions/mean_terminated_length": 810.5786743164062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5161152842150125, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12176151342103846, "kl": 0.029266357421875, "learning_rate": 6.036023639986963e-07, "loss": 0.066, "num_tokens": 1370812423.0, "reward": 2.4955358505249023, "reward_std": 0.42358651757240295, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13731664419174194, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1063.2232666015625, "completions/mean_terminated_length": 819.0863647460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5163283788823184, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13180252181338825, "kl": 0.0263671875, "learning_rate": 6.032699666593001e-07, "loss": 0.0941, "num_tokens": 1371356011.0, "reward": 2.2857143878936768, "reward_std": 0.45496076345443726, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14903545379638672, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 976.1428833007812, "completions/mean_terminated_length": 764.064208984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5165414735496244, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.17003535714402265, "kl": 0.02728271484375, "learning_rate": 6.029375398390545e-07, "loss": 0.1389, "num_tokens": 1371861723.0, "reward": 2.330357313156128, "reward_std": 0.4743497371673584, "rewards/accuracy_reward/mean": 0.44675925374031067, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14238695800304413, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 999.6339721679688, "completions/mean_terminated_length": 805.4920654296875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5167545682169303, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1282537811590294, "kl": 0.02587890625, "learning_rate": 6.026050837219327e-07, "loss": 0.0752, "num_tokens": 1372383623.0, "reward": 2.364955425262451, "reward_std": 0.4266434609889984, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13763901591300964, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 945.263427734375, "completions/mean_terminated_length": 723.5335083007812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5169676628842363, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12431075887668937, "kl": 0.029022216796875, "learning_rate": 6.022725984919235e-07, "loss": 0.0535, "num_tokens": 1372877533.0, "reward": 2.3978796005249023, "reward_std": 0.40170004963874817, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14590215682983398, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 878.1964721679688, "completions/mean_terminated_length": 753.9951171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5171807575515422, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13918978694888412, "kl": 0.031768798828125, "learning_rate": 6.019400843330323e-07, "loss": 0.0859, "num_tokens": 1373337493.0, "reward": 2.517857313156128, "reward_std": 0.4215749502182007, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12561768293380737, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 945.8460083007812, "completions/mean_terminated_length": 778.6812133789062, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5173938522188483, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13742465278040455, "kl": 0.029937744140625, "learning_rate": 6.016075414292802e-07, "loss": 0.0524, "num_tokens": 1373823184.0, "reward": 2.4916296005249023, "reward_std": 0.43216538429260254, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.17153577506542206, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1038.1116943359375, "completions/mean_terminated_length": 847.92041015625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5176069468861542, "frac_reward_zero_std": 0.0, "grad_norm": 0.1317303705896818, "kl": 0.02667236328125, "learning_rate": 6.01274969964704e-07, "loss": 0.068, "num_tokens": 1374357874.0, "reward": 2.372767925262451, "reward_std": 0.4326055645942688, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13526484370231628, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1053.7232666015625, "completions/mean_terminated_length": 863.3297729492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5178200415534602, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12317320852729442, "kl": 0.02532958984375, "learning_rate": 6.009423701233567e-07, "loss": 0.0684, "num_tokens": 1374903814.0, "reward": 2.4614956378936768, "reward_std": 0.4561314582824707, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15894921123981476, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1038.88623046875, "completions/mean_terminated_length": 842.4453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5180331362207661, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13316676861104781, "kl": 0.0272216796875, "learning_rate": 6.006097420893069e-07, "loss": 0.0981, "num_tokens": 1375437779.0, "reward": 2.392857313156128, "reward_std": 0.41668689250946045, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18555572628974915, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 842.5111694335938, "completions/mean_terminated_length": 733.9878540039062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.518246230888072, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13232592861974118, "kl": 0.029754638671875, "learning_rate": 6.002770860466386e-07, "loss": 0.0682, "num_tokens": 1375885064.0, "reward": 2.5396206378936768, "reward_std": 0.378508597612381, "rewards/accuracy_reward/mean": 0.6388888955116272, "rewards/accuracy_reward/std": 0.480879545211792, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.1137678399682045, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 930.9420166015625, "completions/mean_terminated_length": 784.257568359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.518459325555378, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1410193043983278, "kl": 0.028961181640625, "learning_rate": 5.999444021794517e-07, "loss": 0.0676, "num_tokens": 1376366958.0, "reward": 2.490513563156128, "reward_std": 0.45106399059295654, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 956.58935546875, "completions/mean_terminated_length": 781.2849731445312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5186724202226839, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13035982669917046, "kl": 0.02764892578125, "learning_rate": 5.99611690671861e-07, "loss": 0.0704, "num_tokens": 1376859382.0, "reward": 2.4676339626312256, "reward_std": 0.44317126274108887, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16545002162456512, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 863.466552734375, "completions/mean_terminated_length": 662.43603515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5188855148899899, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1726558136881807, "kl": 0.03240966796875, "learning_rate": 5.99278951707997e-07, "loss": 0.1274, "num_tokens": 1377306823.0, "reward": 2.38671875, "reward_std": 0.43243926763534546, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1625526398420334, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 870.4910888671875, "completions/mean_terminated_length": 722.5628051757812, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5190986095572958, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1461447707298886, "kl": 0.029937744140625, "learning_rate": 5.989461854720052e-07, "loss": 0.0885, "num_tokens": 1377758819.0, "reward": 2.5111608505249023, "reward_std": 0.4281720519065857, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15274205803871155, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 970.6451416015625, "completions/mean_terminated_length": 781.18896484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5193117042246018, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12710555187313863, "kl": 0.0274658203125, "learning_rate": 5.986133921480463e-07, "loss": 0.0973, "num_tokens": 1378262020.0, "reward": 2.357142925262451, "reward_std": 0.48469728231430054, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.18097810447216034, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1054.5625, "completions/mean_terminated_length": 818.552490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5195247988919077, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13336666132006755, "kl": 0.027008056640625, "learning_rate": 5.982805719202958e-07, "loss": 0.1206, "num_tokens": 1378798256.0, "reward": 2.32421875, "reward_std": 0.5027849674224854, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16989772021770477, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 974.888427734375, "completions/mean_terminated_length": 815.2974853515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5197378935592136, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13637060998037667, "kl": 0.026458740234375, "learning_rate": 5.979477249729442e-07, "loss": 0.0872, "num_tokens": 1379308398.0, "reward": 2.4620537757873535, "reward_std": 0.469237357378006, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 964.263427734375, "completions/mean_terminated_length": 753.2960205078125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5199509882265196, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13622522182208557, "kl": 0.028167724609375, "learning_rate": 5.976148514901971e-07, "loss": 0.0505, "num_tokens": 1379805092.0, "reward": 2.4129464626312256, "reward_std": 0.429694265127182, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1043.19873046875, "completions/mean_terminated_length": 801.0443115234375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5201640828938255, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11635314453987461, "kl": 0.025115966796875, "learning_rate": 5.972819516562743e-07, "loss": 0.0869, "num_tokens": 1380336829.0, "reward": 2.385044813156128, "reward_std": 0.46809110045433044, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.18929573893547058, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 998.4285888671875, "completions/mean_terminated_length": 807.345703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5203771775611316, "frac_reward_zero_std": 0.0, "grad_norm": 0.1350455071354124, "kl": 0.027252197265625, "learning_rate": 5.969490256554104e-07, "loss": 0.1034, "num_tokens": 1380856189.0, "reward": 2.3292412757873535, "reward_std": 0.48902902007102966, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14550481736660004, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1019.87060546875, "completions/mean_terminated_length": 803.1297607421875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5205902722284375, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12401404166913708, "kl": 0.02496337890625, "learning_rate": 5.966160736718543e-07, "loss": 0.0952, "num_tokens": 1381375059.0, "reward": 2.428013563156128, "reward_std": 0.4561253488063812, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12540756165981293, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1068.47998046875, "completions/mean_terminated_length": 818.79833984375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5208033668957435, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1256169204408851, "kl": 0.025054931640625, "learning_rate": 5.962830958898697e-07, "loss": 0.0703, "num_tokens": 1381921338.0, "reward": 2.2377233505249023, "reward_std": 0.4334922432899475, "rewards/accuracy_reward/mean": 0.38461539149284363, "rewards/accuracy_reward/std": 0.4870900511741638, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.16051718592643738, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 840.0267944335938, "completions/mean_terminated_length": 684.8463134765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5210164615630494, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1458769375932124, "kl": 0.032257080078125, "learning_rate": 5.959500924937341e-07, "loss": 0.0467, "num_tokens": 1382357590.0, "reward": 2.4642858505249023, "reward_std": 0.45937252044677734, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14955390989780426, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1016.7366333007812, "completions/mean_terminated_length": 785.6884765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5212295562303554, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13992978756785904, "kl": 0.02557373046875, "learning_rate": 5.956170636677396e-07, "loss": 0.111, "num_tokens": 1382879920.0, "reward": 2.4419643878936768, "reward_std": 0.45922961831092834, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14903545379638672, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1033.1607666015625, "completions/mean_terminated_length": 802.3890380859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5214426508976613, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12265596561269321, "kl": 0.0274658203125, "learning_rate": 5.952840095961919e-07, "loss": 0.0363, "num_tokens": 1383411752.0, "reward": 2.34375, "reward_std": 0.4808152914047241, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14997069537639618, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 933.21435546875, "completions/mean_terminated_length": 733.726318359375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5216557455649672, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15418116188495984, "kl": 0.027130126953125, "learning_rate": 5.949509304634113e-07, "loss": 0.1096, "num_tokens": 1383903720.0, "reward": 2.407924175262451, "reward_std": 0.4646216630935669, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 981.7567138671875, "completions/mean_terminated_length": 770.7887573242188, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5218688402322732, "frac_reward_zero_std": 0.25, "grad_norm": 0.10868035069385899, "kl": 0.028656005859375, "learning_rate": 5.946178264537312e-07, "loss": 0.0395, "num_tokens": 1384408027.0, "reward": 2.377232313156128, "reward_std": 0.36640506982803345, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1012.9910888671875, "completions/mean_terminated_length": 834.1675415039062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5220819348995791, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12215717705497343, "kl": 0.02606201171875, "learning_rate": 5.942846977514993e-07, "loss": 0.044, "num_tokens": 1384929735.0, "reward": 2.4910714626312256, "reward_std": 0.4684024751186371, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14088858664035797, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 958.9241333007812, "completions/mean_terminated_length": 725.7615356445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5222950295668851, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1292888233626093, "kl": 0.0296630859375, "learning_rate": 5.939515445410772e-07, "loss": 0.0758, "num_tokens": 1385434197.0, "reward": 2.302455425262451, "reward_std": 0.440422385931015, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17634892463684082, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 928.76123046875, "completions/mean_terminated_length": 772.1246948242188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.522508124234191, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14005163755878053, "kl": 0.029022216796875, "learning_rate": 5.936183670068391e-07, "loss": 0.0588, "num_tokens": 1385920394.0, "reward": 2.4598214626312256, "reward_std": 0.45283758640289307, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14523428678512573, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 840.419677734375, "completions/mean_terminated_length": 702.23876953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.522721218901497, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14143409357067108, "kl": 0.0313720703125, "learning_rate": 5.932851653331738e-07, "loss": 0.0464, "num_tokens": 1386363126.0, "reward": 2.493861675262451, "reward_std": 0.3823241591453552, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 935.3660888671875, "completions/mean_terminated_length": 746.537841796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5229343135688029, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14328495436049649, "kl": 0.0303955078125, "learning_rate": 5.929519397044825e-07, "loss": 0.0993, "num_tokens": 1386850074.0, "reward": 2.540736675262451, "reward_std": 0.4627631902694702, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561823636293411, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 970.2857666015625, "completions/mean_terminated_length": 728.83056640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.523147408236109, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.21351183756019135, "kl": 0.0328369140625, "learning_rate": 5.926186903051804e-07, "loss": 0.141, "num_tokens": 1387360138.0, "reward": 2.4503350257873535, "reward_std": 0.4587583839893341, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1579248607158661, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 989.7388916015625, "completions/mean_terminated_length": 741.9366455078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5233605029034148, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1227528978890609, "kl": 0.02587890625, "learning_rate": 5.922854173196953e-07, "loss": 0.0493, "num_tokens": 1387873557.0, "reward": 2.3504464626312256, "reward_std": 0.4151289165019989, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1056.1295166015625, "completions/mean_terminated_length": 796.2872924804688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5235735975707208, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12923648512889652, "kl": 0.023681640625, "learning_rate": 5.919521209324684e-07, "loss": 0.0735, "num_tokens": 1388416607.0, "reward": 2.3510046005249023, "reward_std": 0.37309274077415466, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15014436841011047, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1010.8973388671875, "completions/mean_terminated_length": 822.08447265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5237866922380268, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11451177074598225, "kl": 0.026275634765625, "learning_rate": 5.916188013279536e-07, "loss": 0.0636, "num_tokens": 1388944177.0, "reward": 2.3002233505249023, "reward_std": 0.31751203536987305, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10537806153297424, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 966.5402221679688, "completions/mean_terminated_length": 769.6517333984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5239997869053327, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1460635703868407, "kl": 0.030487060546875, "learning_rate": 5.912854586906183e-07, "loss": 0.0747, "num_tokens": 1389441635.0, "reward": 2.44140625, "reward_std": 0.39540570974349976, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1489589959383011, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 982.74560546875, "completions/mean_terminated_length": 736.9176025390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5242128815726387, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13459047144974556, "kl": 0.027740478515625, "learning_rate": 5.909520932049414e-07, "loss": 0.04, "num_tokens": 1389955233.0, "reward": 2.467076063156128, "reward_std": 0.40677663683891296, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11601705849170685, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 853.3192138671875, "completions/mean_terminated_length": 657.8259887695312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5244259762399446, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1495937155902843, "kl": 0.0316162109375, "learning_rate": 5.906187050554156e-07, "loss": 0.1287, "num_tokens": 1390403264.0, "reward": 2.38671875, "reward_std": 0.38554078340530396, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15291257202625275, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 910.1004638671875, "completions/mean_terminated_length": 740.8743896484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5246390709072506, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1500674984180217, "kl": 0.03179931640625, "learning_rate": 5.902852944265456e-07, "loss": 0.0894, "num_tokens": 1390877981.0, "reward": 2.470424175262451, "reward_std": 0.48989835381507874, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18459028005599976, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 877.5022583007812, "completions/mean_terminated_length": 689.4948120117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5248521655745565, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13166518872955504, "kl": 0.029693603515625, "learning_rate": 5.899518615028489e-07, "loss": 0.0941, "num_tokens": 1391334686.0, "reward": 2.4207589626312256, "reward_std": 0.4612061083316803, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 878.9308471679688, "completions/mean_terminated_length": 738.6424560546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5250652602418624, "frac_reward_zero_std": 0.0, "grad_norm": 0.2578161802889121, "kl": 0.031951904296875, "learning_rate": 5.896184064688549e-07, "loss": 0.0979, "num_tokens": 1391801103.0, "reward": 2.4754464626312256, "reward_std": 0.46477192640304565, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14040927588939667, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 905.77685546875, "completions/mean_terminated_length": 762.2813720703125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5252783549091684, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13054103728060573, "kl": 0.030548095703125, "learning_rate": 5.892849295091053e-07, "loss": 0.061, "num_tokens": 1392269291.0, "reward": 2.5440850257873535, "reward_std": 0.4051132798194885, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.07627084106206894, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 998.6027221679688, "completions/mean_terminated_length": 766.9918212890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5254914495764743, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1226467384994295, "kl": 0.0264892578125, "learning_rate": 5.889514308081542e-07, "loss": 0.0469, "num_tokens": 1392786713.0, "reward": 2.286830425262451, "reward_std": 0.3710114657878876, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14409084618091583, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 827.1205444335938, "completions/mean_terminated_length": 720.4417724609375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5257045442437803, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13635280668833064, "kl": 0.03424072265625, "learning_rate": 5.886179105505677e-07, "loss": 0.0737, "num_tokens": 1393221439.0, "reward": 2.5580358505249023, "reward_std": 0.44701412320137024, "rewards/accuracy_reward/mean": 0.6696428656578064, "rewards/accuracy_reward/std": 0.47086748480796814, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 979.7723388671875, "completions/mean_terminated_length": 808.1917114257812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5259176389110862, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13830309111282307, "kl": 0.02899169921875, "learning_rate": 5.882843689209237e-07, "loss": 0.0644, "num_tokens": 1393728201.0, "reward": 2.4402902126312256, "reward_std": 0.4698637127876282, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1514935940504074, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 881.0045166015625, "completions/mean_terminated_length": 721.0609130859375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5261307335783922, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13348446812520479, "kl": 0.030242919921875, "learning_rate": 5.879508061038119e-07, "loss": 0.0604, "num_tokens": 1394194331.0, "reward": 2.5089287757873535, "reward_std": 0.3965337574481964, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1282729059457779, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 940.3348388671875, "completions/mean_terminated_length": 831.740234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5263438282456981, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13311364045332552, "kl": 0.029022216796875, "learning_rate": 5.876172222838339e-07, "loss": 0.0764, "num_tokens": 1394685569.0, "reward": 2.55078125, "reward_std": 0.4826606512069702, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15343420207500458, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1140.5960693359375, "completions/mean_terminated_length": 852.3617553710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5265569229130042, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12436721430394897, "kl": 0.023406982421875, "learning_rate": 5.872836176456025e-07, "loss": 0.071, "num_tokens": 1395267532.0, "reward": 2.3091518878936768, "reward_std": 0.4215041697025299, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1479213982820511, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 931.6942138671875, "completions/mean_terminated_length": 768.9590454101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5267700175803101, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14306292684992372, "kl": 0.02886962890625, "learning_rate": 5.869499923737427e-07, "loss": 0.0963, "num_tokens": 1395756387.0, "reward": 2.4715402126312256, "reward_std": 0.44619134068489075, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15107274055480957, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1059.0379638671875, "completions/mean_terminated_length": 789.321044921875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.526983112247616, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11203685607425817, "kl": 0.026275634765625, "learning_rate": 5.866163466528903e-07, "loss": 0.081, "num_tokens": 1396299780.0, "reward": 2.4174108505249023, "reward_std": 0.4012661874294281, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16431809961795807, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 904.40185546875, "completions/mean_terminated_length": 770.3641357421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.527196206914922, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15022492914136928, "kl": 0.033355712890625, "learning_rate": 5.86282680667693e-07, "loss": 0.0719, "num_tokens": 1396763128.0, "reward": 2.5580358505249023, "reward_std": 0.3992389440536499, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12511979043483734, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1177.0848388671875, "completions/mean_terminated_length": 883.3134155273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5274093015822279, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12062285629686431, "kl": 0.022613525390625, "learning_rate": 5.859489946028088e-07, "loss": 0.0556, "num_tokens": 1397362094.0, "reward": 2.275111675262451, "reward_std": 0.435001403093338, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16169020533561707, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 955.1920166015625, "completions/mean_terminated_length": 749.3845825195312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5276223962495339, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14135841599502916, "kl": 0.031890869140625, "learning_rate": 5.856152886429081e-07, "loss": 0.0588, "num_tokens": 1397859540.0, "reward": 2.506138563156128, "reward_std": 0.3933805823326111, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12381463497877121, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 922.9531860351562, "completions/mean_terminated_length": 700.3502807617188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5278354909168398, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14999812773451962, "kl": 0.02862548828125, "learning_rate": 5.85281562972671e-07, "loss": 0.1251, "num_tokens": 1398347551.0, "reward": 2.4988839626312256, "reward_std": 0.4405926764011383, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16029927134513855, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1084.2723388671875, "completions/mean_terminated_length": 831.8027954101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5280485855841458, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11600180998752851, "kl": 0.024932861328125, "learning_rate": 5.849478177767894e-07, "loss": 0.0435, "num_tokens": 1398903833.0, "reward": 2.3560268878936768, "reward_std": 0.4728037416934967, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1573437601327896, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 944.0870971679688, "completions/mean_terminated_length": 749.9606323242188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5282616802514517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15122149061124673, "kl": 0.028472900390625, "learning_rate": 5.846140532399657e-07, "loss": 0.0984, "num_tokens": 1399387616.0, "reward": 2.3660714626312256, "reward_std": 0.4789182245731354, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1008.1116333007812, "completions/mean_terminated_length": 802.3582763671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5284747749187576, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13843920547244937, "kl": 0.02508544921875, "learning_rate": 5.842802695469131e-07, "loss": 0.0952, "num_tokens": 1399911906.0, "reward": 2.388392925262451, "reward_std": 0.4354220926761627, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1066.7388916015625, "completions/mean_terminated_length": 853.4212036132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5286878695860636, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15343738770341944, "kl": 0.027252197265625, "learning_rate": 5.839464668823552e-07, "loss": 0.1211, "num_tokens": 1400466829.0, "reward": 2.36328125, "reward_std": 0.49735790491104126, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.18215985596179962, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1119.700927734375, "completions/mean_terminated_length": 866.5284423828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5289009642533695, "frac_reward_zero_std": 0.0, "grad_norm": 0.12748076230154207, "kl": 0.0242919921875, "learning_rate": 5.836126454310263e-07, "loss": 0.0493, "num_tokens": 1401039079.0, "reward": 2.3989956378936768, "reward_std": 0.4652915298938751, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 992.8906860351562, "completions/mean_terminated_length": 780.7373046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5291140589206755, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 4.439852843399013, "kl": 0.06658935546875, "learning_rate": 5.832788053776708e-07, "loss": 0.0883, "num_tokens": 1401558470.0, "reward": 2.3543527126312256, "reward_std": 0.4658573269844055, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.8794642686843872, "rewards/format_reward/std": 0.3259509205818176, "rewards/tag_count_reward/mean": 0.9190848469734192, "rewards/tag_count_reward/std": 0.24861730635166168, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 992.8170166015625, "completions/mean_terminated_length": 756.4097900390625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.5293271535879814, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1315104057846089, "kl": 0.029083251953125, "learning_rate": 5.829449469070441e-07, "loss": 0.0998, "num_tokens": 1402073764.0, "reward": 2.3504464626312256, "reward_std": 0.34892064332962036, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1576172560453415, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 898.5469360351562, "completions/mean_terminated_length": 737.6819458007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5295402482552874, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12191515970847414, "kl": 0.029998779296875, "learning_rate": 5.826110702039108e-07, "loss": 0.0693, "num_tokens": 1402543129.0, "reward": 2.408482313156128, "reward_std": 0.4478207528591156, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067808747291565, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 967.122802734375, "completions/mean_terminated_length": 790.251953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5297533429225934, "frac_reward_zero_std": 0.0, "grad_norm": 0.14887562414092487, "kl": 0.03009033203125, "learning_rate": 5.822771754530463e-07, "loss": 0.0744, "num_tokens": 1403051040.0, "reward": 2.4107143878936768, "reward_std": 0.4643322825431824, "rewards/accuracy_reward/mean": 0.5555555820465088, "rewards/accuracy_reward/std": 0.4974800944328308, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 980.7991333007812, "completions/mean_terminated_length": 738.1205444335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5299664375898994, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1335019217940142, "kl": 0.026947021484375, "learning_rate": 5.819432628392358e-07, "loss": 0.0998, "num_tokens": 1403568678.0, "reward": 2.3002233505249023, "reward_std": 0.4801989197731018, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18866156041622162, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1063.82373046875, "completions/mean_terminated_length": 833.369140625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5301795322572053, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1178279770259059, "kl": 0.023895263671875, "learning_rate": 5.816093325472744e-07, "loss": 0.0539, "num_tokens": 1404112359.0, "reward": 2.33984375, "reward_std": 0.35095176100730896, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14921022951602936, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 886.732177734375, "completions/mean_terminated_length": 734.242431640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5303926269245112, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14542665018048415, "kl": 0.030853271484375, "learning_rate": 5.81275384761967e-07, "loss": 0.0727, "num_tokens": 1404570863.0, "reward": 2.4966518878936768, "reward_std": 0.3693641126155853, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14505796134471893, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 972.1295166015625, "completions/mean_terminated_length": 799.3212280273438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5306057215918172, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11472671389309784, "kl": 0.027313232421875, "learning_rate": 5.809414196681281e-07, "loss": 0.0085, "num_tokens": 1405081513.0, "reward": 2.439174175262451, "reward_std": 0.32061702013015747, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.07479990273714066, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 953.2031860351562, "completions/mean_terminated_length": 777.3549194335938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5308188162591231, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12740261748511683, "kl": 0.027496337890625, "learning_rate": 5.806074374505815e-07, "loss": 0.0398, "num_tokens": 1405574644.0, "reward": 2.51171875, "reward_std": 0.4446275234222412, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12265980988740921, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 986.2656860351562, "completions/mean_terminated_length": 762.4405517578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5310319109264291, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1367710784006684, "kl": 0.028106689453125, "learning_rate": 5.802734382941612e-07, "loss": 0.1059, "num_tokens": 1406078715.0, "reward": 2.3872768878936768, "reward_std": 0.4167911410331726, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.1822258085012436, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 874.9866333007812, "completions/mean_terminated_length": 693.5927734375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.531245005593735, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13320480664421974, "kl": 0.0308837890625, "learning_rate": 5.799394223837101e-07, "loss": 0.0668, "num_tokens": 1406537397.0, "reward": 2.5357143878936768, "reward_std": 0.3654972016811371, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11652304977178574, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1083.453125, "completions/mean_terminated_length": 777.0676879882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.531458100261041, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11942950610744035, "kl": 0.0260009765625, "learning_rate": 5.796053899040804e-07, "loss": 0.0575, "num_tokens": 1407086608.0, "reward": 2.385044813156128, "reward_std": 0.41878095269203186, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.146973118185997, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 985.3170166015625, "completions/mean_terminated_length": 768.2096557617188, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5316711949283469, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12442838341438026, "kl": 0.027313232421875, "learning_rate": 5.792713410401335e-07, "loss": 0.0644, "num_tokens": 1407601294.0, "reward": 2.353236675262451, "reward_std": 0.4003566801548004, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13748814165592194, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 932.1897583007812, "completions/mean_terminated_length": 722.0504150390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5318842895956528, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15871092263141104, "kl": 0.031158447265625, "learning_rate": 5.789372759767398e-07, "loss": 0.0622, "num_tokens": 1408088611.0, "reward": 2.4190850257873535, "reward_std": 0.4261649250984192, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992804229259491, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1032.6295166015625, "completions/mean_terminated_length": 811.896728515625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.5320973842629588, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13023573129032154, "kl": 0.02508544921875, "learning_rate": 5.786031948987787e-07, "loss": 0.0744, "num_tokens": 1408622525.0, "reward": 2.415736675262451, "reward_std": 0.46487846970558167, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15423759818077087, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 937.4688110351562, "completions/mean_terminated_length": 748.9973754882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5323104789302647, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13263306961502633, "kl": 0.029449462890625, "learning_rate": 5.782690979911387e-07, "loss": 0.0818, "num_tokens": 1409114831.0, "reward": 2.4369421005249023, "reward_std": 0.4531923234462738, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16174425184726715, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 929.8951416015625, "completions/mean_terminated_length": 779.8709106445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5325235735975707, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.19656560004697857, "kl": 0.03192138671875, "learning_rate": 5.779349854387169e-07, "loss": 0.0683, "num_tokens": 1409607824.0, "reward": 2.5558037757873535, "reward_std": 0.4403499364852905, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15902084112167358, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 910.1920166015625, "completions/mean_terminated_length": 767.251220703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5327366682648766, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12330147726368776, "kl": 0.030487060546875, "learning_rate": 5.776008574264189e-07, "loss": 0.0826, "num_tokens": 1410089926.0, "reward": 2.4854912757873535, "reward_std": 0.3864891231060028, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10307835787534714, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 965.4420166015625, "completions/mean_terminated_length": 794.8062133789062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5329497629321827, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13715560181129868, "kl": 0.02630615234375, "learning_rate": 5.772667141391589e-07, "loss": 0.0905, "num_tokens": 1410587724.0, "reward": 2.3247768878936768, "reward_std": 0.40243247151374817, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15571676194667816, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 951.8839721679688, "completions/mean_terminated_length": 779.1111450195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5331628575994886, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1389484203241223, "kl": 0.028961181640625, "learning_rate": 5.769325557618595e-07, "loss": 0.1085, "num_tokens": 1411075192.0, "reward": 2.4693081378936768, "reward_std": 0.4963549077510834, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15941192209720612, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 985.5000610351562, "completions/mean_terminated_length": 743.8904418945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5333759522667946, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.38685527033596045, "kl": 0.0513916015625, "learning_rate": 5.765983824794523e-07, "loss": 0.0954, "num_tokens": 1411587432.0, "reward": 2.3013393878936768, "reward_std": 0.38494160771369934, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967316269874573, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 895.2745971679688, "completions/mean_terminated_length": 717.0180053710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5335890469341005, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16634039395593334, "kl": 0.032623291015625, "learning_rate": 5.762641944768763e-07, "loss": 0.1091, "num_tokens": 1412054099.0, "reward": 2.447544813156128, "reward_std": 0.5318894386291504, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16929873824119568, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 946.6183471679688, "completions/mean_terminated_length": 773.0155029296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5338021416014064, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1309891704087857, "kl": 0.028106689453125, "learning_rate": 5.759299919390788e-07, "loss": 0.0845, "num_tokens": 1412547960.0, "reward": 2.334263563156128, "reward_std": 0.41789138317108154, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13359208405017853, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1089.1429443359375, "completions/mean_terminated_length": 810.0518188476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5340152362687124, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13884567732794723, "kl": 0.023712158203125, "learning_rate": 5.755957750510157e-07, "loss": 0.0687, "num_tokens": 1413111000.0, "reward": 2.3666296005249023, "reward_std": 0.38791385293006897, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.12189408391714096, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1011.9285888671875, "completions/mean_terminated_length": 793.5135498046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5342283309360183, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1213774906017427, "kl": 0.027923583984375, "learning_rate": 5.752615439976504e-07, "loss": 0.0621, "num_tokens": 1413633480.0, "reward": 2.3565850257873535, "reward_std": 0.4395703077316284, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13669590651988983, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1020.747802734375, "completions/mean_terminated_length": 820.7760009765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5344414256033243, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13448910255368723, "kl": 0.028778076171875, "learning_rate": 5.749272989639539e-07, "loss": 0.0558, "num_tokens": 1414154583.0, "reward": 2.3643975257873535, "reward_std": 0.43220189213752747, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1611565798521042, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1006.66748046875, "completions/mean_terminated_length": 748.509765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5346545202706302, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1304327197273908, "kl": 0.027557373046875, "learning_rate": 5.745930401349054e-07, "loss": 0.1002, "num_tokens": 1414676130.0, "reward": 2.35546875, "reward_std": 0.48487091064453125, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.1728263646364212, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 907.82373046875, "completions/mean_terminated_length": 758.103515625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5348676149379362, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12486998668341937, "kl": 0.030303955078125, "learning_rate": 5.742587676954919e-07, "loss": 0.0173, "num_tokens": 1415152723.0, "reward": 2.4190850257873535, "reward_std": 0.3946661353111267, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.1061379685997963, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1021.2098388671875, "completions/mean_terminated_length": 794.5885620117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5350807096052421, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1453946247051401, "kl": 0.024383544921875, "learning_rate": 5.739244818307069e-07, "loss": 0.0792, "num_tokens": 1415680065.0, "reward": 2.439732313156128, "reward_std": 0.5073918700218201, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16487935185432434, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 991.10498046875, "completions/mean_terminated_length": 801.9763793945312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5352938042725481, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3133943003002054, "kl": 0.02825927734375, "learning_rate": 5.735901827255529e-07, "loss": 0.0875, "num_tokens": 1416199744.0, "reward": 2.3738839626312256, "reward_std": 0.4831084609031677, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.1648755669593811, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 970.3370971679688, "completions/mean_terminated_length": 760.552001953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.535506898939854, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12839339265974306, "kl": 0.02838134765625, "learning_rate": 5.732558705650383e-07, "loss": 0.0244, "num_tokens": 1416706727.0, "reward": 2.4525671005249023, "reward_std": 0.4027446508407593, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1258348822593689, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1109.993408203125, "completions/mean_terminated_length": 836.9711303710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5357199936071599, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13708669443582794, "kl": 0.027130126953125, "learning_rate": 5.729215455341794e-07, "loss": 0.1028, "num_tokens": 1417279876.0, "reward": 2.408482313156128, "reward_std": 0.5162177681922913, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1378791779279709, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 872.9732666015625, "completions/mean_terminated_length": 731.969970703125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.535933088274466, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.21082023499587169, "kl": 0.03302001953125, "learning_rate": 5.725872078179995e-07, "loss": 0.1104, "num_tokens": 1417738520.0, "reward": 2.5306921005249023, "reward_std": 0.39250054955482483, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 917.732177734375, "completions/mean_terminated_length": 749.6410522460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5361461829417719, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.20510272831215312, "kl": 0.037139892578125, "learning_rate": 5.722528576015291e-07, "loss": 0.1151, "num_tokens": 1418221248.0, "reward": 2.454799175262451, "reward_std": 0.4593575596809387, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12870889902114868, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 938.1183471679688, "completions/mean_terminated_length": 749.7572021484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5363592776090779, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1375264190704001, "kl": 0.0279541015625, "learning_rate": 5.719184950698053e-07, "loss": 0.0741, "num_tokens": 1418710645.0, "reward": 2.4849331378936768, "reward_std": 0.38200175762176514, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493200302124, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11638233810663223, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1036.01123046875, "completions/mean_terminated_length": 778.05322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5365723722763838, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12382754785966851, "kl": 0.027618408203125, "learning_rate": 5.71584120407872e-07, "loss": 0.0846, "num_tokens": 1419237770.0, "reward": 2.3738839626312256, "reward_std": 0.439721941947937, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1573437601327896, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 854.5982666015625, "completions/mean_terminated_length": 694.4708862304688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5367854669436898, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1404310411286148, "kl": 0.03167724609375, "learning_rate": 5.712497338007803e-07, "loss": 0.0298, "num_tokens": 1419682262.0, "reward": 2.3074777126312256, "reward_std": 0.3961949050426483, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.8727678656578064, "rewards/format_reward/std": 0.3336053788661957, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18908046185970306, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1084.399658203125, "completions/mean_terminated_length": 858.7631225585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5369985616109957, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12986210209506627, "kl": 0.02398681640625, "learning_rate": 5.709153354335875e-07, "loss": 0.0795, "num_tokens": 1420239897.0, "reward": 2.404017925262451, "reward_std": 0.47819194197654724, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14433756470680237, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1043.734375, "completions/mean_terminated_length": 794.7659912109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5372116562783016, "frac_reward_zero_std": 0.0, "grad_norm": 0.14698619176495026, "kl": 0.026458740234375, "learning_rate": 5.705809254913576e-07, "loss": 0.0623, "num_tokens": 1420780786.0, "reward": 2.466517925262451, "reward_std": 0.4764553904533386, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 976.9910888671875, "completions/mean_terminated_length": 715.1889038085938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5374247509456076, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12205023232259284, "kl": 0.03057861328125, "learning_rate": 5.702465041591605e-07, "loss": 0.0599, "num_tokens": 1421285022.0, "reward": 2.4676339626312256, "reward_std": 0.4549930691719055, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15208269655704498, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 971.6964721679688, "completions/mean_terminated_length": 734.1471557617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5376378456129135, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13106855167259127, "kl": 0.02789306640625, "learning_rate": 5.699120716220734e-07, "loss": 0.0645, "num_tokens": 1421784230.0, "reward": 2.5089287757873535, "reward_std": 0.38742080330848694, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12672585248947144, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 955.2567138671875, "completions/mean_terminated_length": 759.7131958007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5378509402802195, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13571287968399678, "kl": 0.028289794921875, "learning_rate": 5.695776280651785e-07, "loss": 0.0988, "num_tokens": 1422275385.0, "reward": 2.375, "reward_std": 0.4402559697628021, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.11756829917430878, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 956.732177734375, "completions/mean_terminated_length": 771.530029296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5380640349475254, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1294028401861316, "kl": 0.030426025390625, "learning_rate": 5.692431736735653e-07, "loss": 0.0852, "num_tokens": 1422777185.0, "reward": 2.4068081378936768, "reward_std": 0.4592633545398712, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14827017486095428, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 966.5335083007812, "completions/mean_terminated_length": 792.826416015625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5382771296148314, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11906959217067009, "kl": 0.02728271484375, "learning_rate": 5.689087086323281e-07, "loss": 0.0204, "num_tokens": 1423279728.0, "reward": 2.4732143878936768, "reward_std": 0.4347270131111145, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1540280133485794, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1162.80810546875, "completions/mean_terminated_length": 878.1887817382812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5384902242821373, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1141979224608553, "kl": 0.023040771484375, "learning_rate": 5.685742331265682e-07, "loss": 0.0847, "num_tokens": 1423868090.0, "reward": 2.177455425262451, "reward_std": 0.4327274262905121, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45639169216156006, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16713166236877441, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1052.6785888671875, "completions/mean_terminated_length": 822.989013671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.5387033189494433, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11984699791988482, "kl": 0.026153564453125, "learning_rate": 5.682397473413918e-07, "loss": 0.0922, "num_tokens": 1424405194.0, "reward": 2.36328125, "reward_std": 0.4711918234825134, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15252019464969635, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1029.310302734375, "completions/mean_terminated_length": 804.476806640625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5389164136167492, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12966459816387496, "kl": 0.0260009765625, "learning_rate": 5.679052514619116e-07, "loss": 0.078, "num_tokens": 1424940149.0, "reward": 2.303013563156128, "reward_std": 0.44794052839279175, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.175029918551445, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 956.810302734375, "completions/mean_terminated_length": 778.251953125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.5391295082840551, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12046376271080182, "kl": 0.027435302734375, "learning_rate": 5.675707456732451e-07, "loss": 0.0731, "num_tokens": 1425436016.0, "reward": 2.4933037757873535, "reward_std": 0.37164655327796936, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 967.0625610351562, "completions/mean_terminated_length": 735.6422729492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5393426029513612, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13157964400801345, "kl": 0.02764892578125, "learning_rate": 5.672362301605159e-07, "loss": 0.0574, "num_tokens": 1425939740.0, "reward": 2.446986675262451, "reward_std": 0.41433101892471313, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13748814165592194, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1089.984375, "completions/mean_terminated_length": 807.5635375976562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5395556976186671, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13316268841935947, "kl": 0.025665283203125, "learning_rate": 5.669017051088526e-07, "loss": 0.0839, "num_tokens": 1426494133.0, "reward": 2.314732313156128, "reward_std": 0.40007105469703674, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1572524905204773, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 915.3438110351562, "completions/mean_terminated_length": 753.5357055664062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5397687922859731, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12801515974999175, "kl": 0.02911376953125, "learning_rate": 5.665671707033892e-07, "loss": 0.0738, "num_tokens": 1426969071.0, "reward": 2.431919813156128, "reward_std": 0.4664011299610138, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14505796134471893, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 930.5982666015625, "completions/mean_terminated_length": 741.0548706054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.539981886953279, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 1.3898405126493263, "kl": 0.029632568359375, "learning_rate": 5.662326271292649e-07, "loss": 0.0772, "num_tokens": 1427470523.0, "reward": 2.3995537757873535, "reward_std": 0.42330262064933777, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 922.7991333007812, "completions/mean_terminated_length": 748.7989501953125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.540194981620585, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1219167225201827, "kl": 0.02716064453125, "learning_rate": 5.658980745716241e-07, "loss": 0.0858, "num_tokens": 1427951329.0, "reward": 2.462611675262451, "reward_std": 0.41727176308631897, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14825333654880524, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1041.90625, "completions/mean_terminated_length": 778.3380126953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5404080762878909, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11989181049298381, "kl": 0.025115966796875, "learning_rate": 5.655635132156159e-07, "loss": 0.0759, "num_tokens": 1428488407.0, "reward": 2.3270089626312256, "reward_std": 0.424572616815567, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1662929803133011, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1071.59375, "completions/mean_terminated_length": 856.0926513671875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5406211709551968, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11491931818942336, "kl": 0.025115966796875, "learning_rate": 5.652289432463944e-07, "loss": 0.0358, "num_tokens": 1429041425.0, "reward": 2.3705358505249023, "reward_std": 0.4363420605659485, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1027.915283203125, "completions/mean_terminated_length": 782.0775146484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.5408342656225028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1373722455798989, "kl": 0.028717041015625, "learning_rate": 5.648943648491184e-07, "loss": 0.0627, "num_tokens": 1429569467.0, "reward": 2.2578125, "reward_std": 0.5009745359420776, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15822990238666534, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1226.310302734375, "completions/mean_terminated_length": 856.682861328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5410473602898087, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1193030656405214, "kl": 0.024658203125, "learning_rate": 5.645597782089517e-07, "loss": 0.1012, "num_tokens": 1430190854.0, "reward": 2.2979912757873535, "reward_std": 0.4786623418331146, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.16274166107177734, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1093.63623046875, "completions/mean_terminated_length": 863.6370849609375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5412604549571147, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10841522890703569, "kl": 0.023223876953125, "learning_rate": 5.642251835110621e-07, "loss": 0.0728, "num_tokens": 1430754931.0, "reward": 2.3314733505249023, "reward_std": 0.38782799243927, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.49470317363739014, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 936.33935546875, "completions/mean_terminated_length": 774.2813110351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5414735496244206, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13340307722619693, "kl": 0.029937744140625, "learning_rate": 5.638905809406222e-07, "loss": 0.0992, "num_tokens": 1431241675.0, "reward": 2.377232313156128, "reward_std": 0.4577689468860626, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13058780133724213, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1007.7589721679688, "completions/mean_terminated_length": 815.1216430664062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5416866442917266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12328841874383709, "kl": 0.025299072265625, "learning_rate": 5.635559706828093e-07, "loss": 0.0867, "num_tokens": 1431756543.0, "reward": 2.3599331378936768, "reward_std": 0.4778337776660919, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15551920235157013, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 901.0513916015625, "completions/mean_terminated_length": 727.092529296875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5418997389590325, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1473653153540454, "kl": 0.03179931640625, "learning_rate": 5.632213529228038e-07, "loss": 0.1122, "num_tokens": 1432219062.0, "reward": 2.564732313156128, "reward_std": 0.5007344484329224, "rewards/accuracy_reward/mean": 0.6919642686843872, "rewards/accuracy_reward/std": 0.462197482585907, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 971.3750610351562, "completions/mean_terminated_length": 754.8954467773438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5421128336263386, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13695829674396554, "kl": 0.027191162109375, "learning_rate": 5.628867278457918e-07, "loss": 0.0469, "num_tokens": 1432727310.0, "reward": 2.35546875, "reward_std": 0.4119955599308014, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14590215682983398, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 974.8995971679688, "completions/mean_terminated_length": 748.6784057617188, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5423259282936445, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1262898319965193, "kl": 0.0294189453125, "learning_rate": 5.625520956369622e-07, "loss": 0.0746, "num_tokens": 1433233777.0, "reward": 2.439732313156128, "reward_std": 0.35909226536750793, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1007.185302734375, "completions/mean_terminated_length": 773.9972534179688, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5425390229609504, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13804189209290182, "kl": 0.028045654296875, "learning_rate": 5.622174564815085e-07, "loss": 0.0842, "num_tokens": 1433755668.0, "reward": 2.4090402126312256, "reward_std": 0.39986953139305115, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15675851702690125, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1051.3929443359375, "completions/mean_terminated_length": 811.2132568359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5427521176282564, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1355460551757965, "kl": 0.024139404296875, "learning_rate": 5.618828105646277e-07, "loss": 0.0769, "num_tokens": 1434296500.0, "reward": 2.3777902126312256, "reward_std": 0.4623502492904663, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12956929206848145, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 961.3839721679688, "completions/mean_terminated_length": 749.8560180664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5429652122955623, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 2150.53406387813, "kl": 105.60092163085938, "learning_rate": 5.615481580715209e-07, "loss": 4.3181, "num_tokens": 1434796752.0, "reward": 2.3744421005249023, "reward_std": 0.42382705211639404, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15122967958450317, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 886.1004638671875, "completions/mean_terminated_length": 706.4252319335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5431783069628683, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14041413351707122, "kl": 0.029632568359375, "learning_rate": 5.612134991873925e-07, "loss": 0.0755, "num_tokens": 1435261293.0, "reward": 2.5033483505249023, "reward_std": 0.4114845395088196, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11055814474821091, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1136.0982666015625, "completions/mean_terminated_length": 842.890869140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5433914016301742, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11273338321191699, "kl": 0.0240478515625, "learning_rate": 5.608788340974506e-07, "loss": 0.073, "num_tokens": 1435843113.0, "reward": 2.25390625, "reward_std": 0.468479722738266, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.1660102754831314, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1001.5692138671875, "completions/mean_terminated_length": 780.9702758789062, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5436044962974802, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1375104936446683, "kl": 0.029449462890625, "learning_rate": 5.605441629869066e-07, "loss": 0.1008, "num_tokens": 1436357800.0, "reward": 2.4464287757873535, "reward_std": 0.4158378541469574, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13833114504814148, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1074.529052734375, "completions/mean_terminated_length": 829.8016357421875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5438175909647861, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12140846818449216, "kl": 0.0244140625, "learning_rate": 5.602094860409758e-07, "loss": 0.0958, "num_tokens": 1436912869.0, "reward": 2.3275671005249023, "reward_std": 0.4429693818092346, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16174425184726715, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 980.6607666015625, "completions/mean_terminated_length": 799.5195922851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.544030685632092, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13010839400105428, "kl": 0.028778076171875, "learning_rate": 5.598748034448758e-07, "loss": 0.0877, "num_tokens": 1437414717.0, "reward": 2.53515625, "reward_std": 0.46613243222236633, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1122.1138916015625, "completions/mean_terminated_length": 849.1647338867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.544243780299398, "frac_reward_zero_std": 0.0, "grad_norm": 0.14159495696811322, "kl": 0.023284912109375, "learning_rate": 5.595401153838279e-07, "loss": 0.0718, "num_tokens": 1437994736.0, "reward": 2.2963171005249023, "reward_std": 0.4919103682041168, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14469929039478302, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1055.493408203125, "completions/mean_terminated_length": 884.0131225585938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5444568749667039, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13486247681306315, "kl": 0.027191162109375, "learning_rate": 5.592054220430563e-07, "loss": 0.0822, "num_tokens": 1438539565.0, "reward": 2.4029018878936768, "reward_std": 0.5142381191253662, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1728886514902115, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 975.7232666015625, "completions/mean_terminated_length": 780.5066528320312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5446699696340099, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12594702108622743, "kl": 0.028594970703125, "learning_rate": 5.588707236077883e-07, "loss": 0.0936, "num_tokens": 1439046705.0, "reward": 2.45703125, "reward_std": 0.4303736090660095, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 980.7589721679688, "completions/mean_terminated_length": 719.8778076171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.5448830643013158, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12738095522427592, "kl": 0.02789306640625, "learning_rate": 5.58536020263254e-07, "loss": 0.0643, "num_tokens": 1439554021.0, "reward": 2.34375, "reward_std": 0.46259820461273193, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.19045621156692505, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1004.5960083007812, "completions/mean_terminated_length": 749.5416870117188, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5450961589686218, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13022701076071544, "kl": 0.02978515625, "learning_rate": 5.582013121946854e-07, "loss": 0.1265, "num_tokens": 1440083824.0, "reward": 2.359375, "reward_std": 0.5142177939414978, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.315234512090683, "rewards/tag_count_reward/mean": 0.9397321343421936, "rewards/tag_count_reward/std": 0.1906527429819107, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 926.9777221679688, "completions/mean_terminated_length": 773.3350219726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5453092536359277, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12026737910838516, "kl": 0.027313232421875, "learning_rate": 5.578665995873186e-07, "loss": 0.0705, "num_tokens": 1440566870.0, "reward": 2.4503350257873535, "reward_std": 0.38047823309898376, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14517304301261902, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1194.37060546875, "completions/mean_terminated_length": 964.6401977539062, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5455223483032338, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1238930668961791, "kl": 0.024200439453125, "learning_rate": 5.57531882626391e-07, "loss": 0.0637, "num_tokens": 1441176556.0, "reward": 2.3080358505249023, "reward_std": 0.5359495878219604, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18524597585201263, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1019.6272583007812, "completions/mean_terminated_length": 812.8499145507812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5457354429705397, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13529000791082935, "kl": 0.028045654296875, "learning_rate": 5.571971614971429e-07, "loss": 0.1024, "num_tokens": 1441698005.0, "reward": 2.3973214626312256, "reward_std": 0.4668022692203522, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1010.3035888671875, "completions/mean_terminated_length": 767.3168334960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5459485376378456, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11849813685526632, "kl": 0.029266357421875, "learning_rate": 5.568624363848166e-07, "loss": 0.074, "num_tokens": 1442223165.0, "reward": 2.25390625, "reward_std": 0.39514774084091187, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1098.828125, "completions/mean_terminated_length": 866.808349609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5461616323051516, "frac_reward_zero_std": 0.0, "grad_norm": 0.12414305753363285, "kl": 0.025146484375, "learning_rate": 5.565277074746574e-07, "loss": 0.0952, "num_tokens": 1442782432.0, "reward": 2.275669813156128, "reward_std": 0.5509776473045349, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.1878126710653305, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1068.5067138671875, "completions/mean_terminated_length": 787.0430908203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5463747269724575, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1301431766941673, "kl": 0.02874755859375, "learning_rate": 5.561929749519114e-07, "loss": 0.0767, "num_tokens": 1443332947.0, "reward": 2.40234375, "reward_std": 0.44990086555480957, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 856.0491333007812, "completions/mean_terminated_length": 706.3065185546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5465878216397635, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1418762295400127, "kl": 0.029876708984375, "learning_rate": 5.558582390018282e-07, "loss": 0.0778, "num_tokens": 1443775513.0, "reward": 2.5457589626312256, "reward_std": 0.36205828189849854, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 961.2857666015625, "completions/mean_terminated_length": 789.994873046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5468009163070694, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13569106289844665, "kl": 0.029510498046875, "learning_rate": 5.555234998096576e-07, "loss": 0.0727, "num_tokens": 1444271929.0, "reward": 2.5150671005249023, "reward_std": 0.3758191764354706, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11660738289356232, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1035.805908203125, "completions/mean_terminated_length": 798.7906494140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5470140109743754, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1165296335371031, "kl": 0.02587890625, "learning_rate": 5.55188757560653e-07, "loss": 0.0466, "num_tokens": 1444808098.0, "reward": 2.3839287757873535, "reward_std": 0.3771636188030243, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1007.0558471679688, "completions/mean_terminated_length": 784.1978759765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5472271056416813, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12479448805958689, "kl": 0.0252685546875, "learning_rate": 5.548540124400683e-07, "loss": 0.0978, "num_tokens": 1445326411.0, "reward": 2.428013563156128, "reward_std": 0.4119296967983246, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1168.419677734375, "completions/mean_terminated_length": 882.1657104492188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5474402003089872, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12915338114329536, "kl": 0.0235595703125, "learning_rate": 5.54519264633159e-07, "loss": 0.0897, "num_tokens": 1445921511.0, "reward": 2.189174175262451, "reward_std": 0.44451647996902466, "rewards/accuracy_reward/mean": 0.3147321343421936, "rewards/accuracy_reward/std": 0.4649282991886139, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.17231273651123047, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1067.40185546875, "completions/mean_terminated_length": 879.6276245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5476532949762932, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11660486962641035, "kl": 0.02398681640625, "learning_rate": 5.541845143251828e-07, "loss": 0.0745, "num_tokens": 1446472635.0, "reward": 2.4112725257873535, "reward_std": 0.43731269240379333, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16288256645202637, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1103.21435546875, "completions/mean_terminated_length": 838.6742553710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5478663896435991, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11840129100858982, "kl": 0.023468017578125, "learning_rate": 5.538497617013983e-07, "loss": 0.0534, "num_tokens": 1447054363.0, "reward": 2.30859375, "reward_std": 0.4556789696216583, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16595762968063354, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1004.6451416015625, "completions/mean_terminated_length": 804.8536987304688, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5480794843109051, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14144215124798631, "kl": 0.027496337890625, "learning_rate": 5.535150069470652e-07, "loss": 0.0951, "num_tokens": 1447574524.0, "reward": 2.377232313156128, "reward_std": 0.44333258271217346, "rewards/accuracy_reward/mean": 0.5069444179534912, "rewards/accuracy_reward/std": 0.5005314350128174, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1036.263427734375, "completions/mean_terminated_length": 832.8311157226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.548292578978211, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13532153314087433, "kl": 0.026092529296875, "learning_rate": 5.531802502474449e-07, "loss": 0.0817, "num_tokens": 1448107938.0, "reward": 2.318080425262451, "reward_std": 0.5077657699584961, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960493743419647, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15610112249851227, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1060.1785888671875, "completions/mean_terminated_length": 858.3656005859375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5485056736455171, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12089921734551706, "kl": 0.024444580078125, "learning_rate": 5.528454917877995e-07, "loss": 0.0821, "num_tokens": 1448655026.0, "reward": 2.4793527126312256, "reward_std": 0.5006406903266907, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1070.607177734375, "completions/mean_terminated_length": 811.0734252929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.548718768312823, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13688520262825593, "kl": 0.02423095703125, "learning_rate": 5.525107317533922e-07, "loss": 0.0883, "num_tokens": 1449203522.0, "reward": 2.40234375, "reward_std": 0.449735164642334, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11317373067140579, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 976.7120971679688, "completions/mean_terminated_length": 781.6754760742188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.548931862980129, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12134551453067012, "kl": 0.027252197265625, "learning_rate": 5.521759703294871e-07, "loss": 0.0299, "num_tokens": 1449709745.0, "reward": 2.4994421005249023, "reward_std": 0.4723702371120453, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13463464379310608, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1012.57373046875, "completions/mean_terminated_length": 741.3211059570312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5491449576474349, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15234245257056114, "kl": 0.027496337890625, "learning_rate": 5.518412077013489e-07, "loss": 0.0598, "num_tokens": 1450238322.0, "reward": 2.4564733505249023, "reward_std": 0.42672479152679443, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15116052329540253, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 911.7031860351562, "completions/mean_terminated_length": 701.2777709960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5493580523147408, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1445336646711588, "kl": 0.02880859375, "learning_rate": 5.515064440542433e-07, "loss": 0.0767, "num_tokens": 1450712221.0, "reward": 2.4765625, "reward_std": 0.40348440408706665, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559476852417, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1015.3058471679688, "completions/mean_terminated_length": 810.9759521484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5495711469820468, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12032728502922249, "kl": 0.027740478515625, "learning_rate": 5.51171679573436e-07, "loss": 0.0877, "num_tokens": 1451244470.0, "reward": 2.5379464626312256, "reward_std": 0.42562028765678406, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.15048591792583466, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1058.8616943359375, "completions/mean_terminated_length": 827.2451782226562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5497842416493527, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11493643529843638, "kl": 0.026611328125, "learning_rate": 5.508369144441939e-07, "loss": 0.0684, "num_tokens": 1451792216.0, "reward": 2.424107313156128, "reward_std": 0.4627886116504669, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 902.0313110351562, "completions/mean_terminated_length": 738.3214111328125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5499973363166587, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1381902211240128, "kl": 0.031005859375, "learning_rate": 5.505021488517836e-07, "loss": 0.0766, "num_tokens": 1452265366.0, "reward": 2.5401787757873535, "reward_std": 0.455746591091156, "rewards/accuracy_reward/mean": 0.6629464030265808, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12286446243524551, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 989.30810546875, "completions/mean_terminated_length": 715.7135009765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5502104309839646, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12238713164235202, "kl": 0.0269775390625, "learning_rate": 5.501673829814725e-07, "loss": 0.0695, "num_tokens": 1452779120.0, "reward": 2.3660714626312256, "reward_std": 0.4281170070171356, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15813913941383362, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 921.8928833007812, "completions/mean_terminated_length": 754.4205322265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5504235256512706, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14369608573527254, "kl": 0.029876708984375, "learning_rate": 5.498326170185274e-07, "loss": 0.1103, "num_tokens": 1453259616.0, "reward": 2.424107313156128, "reward_std": 0.4437844753265381, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1168653815984726, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 971.1964721679688, "completions/mean_terminated_length": 781.8372802734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5506366203185765, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12703118156672616, "kl": 0.0264892578125, "learning_rate": 5.494978511482165e-07, "loss": 0.0723, "num_tokens": 1453762264.0, "reward": 2.529017925262451, "reward_std": 0.41542086005210876, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12155692279338837, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1008.6920166015625, "completions/mean_terminated_length": 786.184326171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5508497149858825, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12812316691405934, "kl": 0.025390625, "learning_rate": 5.491630855558062e-07, "loss": 0.1043, "num_tokens": 1454290974.0, "reward": 2.4029018878936768, "reward_std": 0.4100467264652252, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13607001304626465, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 856.7835083007812, "completions/mean_terminated_length": 733.55419921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5510628096531884, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.20322389100892088, "kl": 0.0384521484375, "learning_rate": 5.488283204265642e-07, "loss": 0.0712, "num_tokens": 1454736029.0, "reward": 2.4916296005249023, "reward_std": 0.39276954531669617, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151521027088165, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 964.9888916015625, "completions/mean_terminated_length": 747.2252197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5512759043204943, "frac_reward_zero_std": 0.0, "grad_norm": 0.14209145193041803, "kl": 0.03094482421875, "learning_rate": 5.48493555945757e-07, "loss": 0.0818, "num_tokens": 1455236184.0, "reward": 2.3895089626312256, "reward_std": 0.4565965235233307, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1114.9866943359375, "completions/mean_terminated_length": 836.434814453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5514889989878003, "frac_reward_zero_std": 0.0, "grad_norm": 0.1324779333317211, "kl": 0.02508544921875, "learning_rate": 5.481587922986511e-07, "loss": 0.1283, "num_tokens": 1455812034.0, "reward": 2.3314733505249023, "reward_std": 0.5204752087593079, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1599874496459961, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1023.2656860351562, "completions/mean_terminated_length": 765.6508178710938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5517020936551063, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 1.2567054797592596, "kl": 0.163177490234375, "learning_rate": 5.47824029670513e-07, "loss": 0.072, "num_tokens": 1456345289.0, "reward": 2.294642925262451, "reward_std": 0.39642608165740967, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 977.732177734375, "completions/mean_terminated_length": 765.9679565429688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5519151883224123, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.4967322351541157, "kl": 0.08843994140625, "learning_rate": 5.474892682466078e-07, "loss": 0.0844, "num_tokens": 1456861937.0, "reward": 2.4481027126312256, "reward_std": 0.45657193660736084, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16229133307933807, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 954.919677734375, "completions/mean_terminated_length": 789.131103515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5521282829897182, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12856782879075673, "kl": 0.027008056640625, "learning_rate": 5.471545082122006e-07, "loss": 0.0541, "num_tokens": 1457358989.0, "reward": 2.4598214626312256, "reward_std": 0.4691341519355774, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.13043475151062012, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 857.935302734375, "completions/mean_terminated_length": 721.7586669921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5523413776570242, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13654891552826218, "kl": 0.03204345703125, "learning_rate": 5.468197497525552e-07, "loss": 0.0511, "num_tokens": 1457812544.0, "reward": 2.5636162757873535, "reward_std": 0.3479025661945343, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1312885582447052, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 943.0647583007812, "completions/mean_terminated_length": 788.4300537109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5525544723243301, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14404815460757897, "kl": 0.02978515625, "learning_rate": 5.464849930529349e-07, "loss": 0.1286, "num_tokens": 1458303741.0, "reward": 2.4916296005249023, "reward_std": 0.4349367618560791, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13903217017650604, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 974.0692138671875, "completions/mean_terminated_length": 768.4228515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.552767566991636, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14537344691061183, "kl": 0.028778076171875, "learning_rate": 5.461502382986018e-07, "loss": 0.0999, "num_tokens": 1458804380.0, "reward": 2.4185268878936768, "reward_std": 0.4646940529346466, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15339045226573944, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1093.9888916015625, "completions/mean_terminated_length": 854.1536254882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.552980661658942, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14030659575507484, "kl": 0.024139404296875, "learning_rate": 5.458154856748172e-07, "loss": 0.0673, "num_tokens": 1459369063.0, "reward": 2.310826063156128, "reward_std": 0.530843198299408, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16573180258274078, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 948.794677734375, "completions/mean_terminated_length": 768.9246826171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5531937563262479, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13749057975563722, "kl": 0.02874755859375, "learning_rate": 5.454807353668411e-07, "loss": 0.0599, "num_tokens": 1459862555.0, "reward": 2.4659600257873535, "reward_std": 0.38980361819267273, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12715734541416168, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 868.6361694335938, "completions/mean_terminated_length": 693.2435913085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5534068509935539, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1332445726900636, "kl": 0.033447265625, "learning_rate": 5.451459875599317e-07, "loss": 0.0849, "num_tokens": 1460318840.0, "reward": 2.541294813156128, "reward_std": 0.43197789788246155, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11709482222795486, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 896.2813110351562, "completions/mean_terminated_length": 758.0750122070312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5536199456608598, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1343735938852152, "kl": 0.0291748046875, "learning_rate": 5.44811242439347e-07, "loss": 0.0724, "num_tokens": 1460785622.0, "reward": 2.4296875, "reward_std": 0.42164888978004456, "rewards/accuracy_reward/mean": 0.5509259104728699, "rewards/accuracy_reward/std": 0.49797651171684265, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585102021694183, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 990.8281860351562, "completions/mean_terminated_length": 778.2600708007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5538330403281658, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13371481214003675, "kl": 0.026824951171875, "learning_rate": 5.444765001903424e-07, "loss": 0.1133, "num_tokens": 1461305305.0, "reward": 2.3677456378936768, "reward_std": 0.43860164284706116, "rewards/accuracy_reward/mean": 0.5046296119689941, "rewards/accuracy_reward/std": 0.5005582571029663, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14989471435546875, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 963.6138916015625, "completions/mean_terminated_length": 792.68994140625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5540461349954717, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13496517540196407, "kl": 0.0272216796875, "learning_rate": 5.44141760998172e-07, "loss": 0.0771, "num_tokens": 1461803852.0, "reward": 2.427455425262451, "reward_std": 0.37594443559646606, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 969.46435546875, "completions/mean_terminated_length": 756.064208984375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5542592296627777, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12995735936192493, "kl": 0.027374267578125, "learning_rate": 5.438070250480887e-07, "loss": 0.0668, "num_tokens": 1462303708.0, "reward": 2.4771206378936768, "reward_std": 0.4451123774051666, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.152151420712471, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 962.2366333007812, "completions/mean_terminated_length": 781.2760620117188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5544723243300836, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11735627642728928, "kl": 0.027984619140625, "learning_rate": 5.434722925253427e-07, "loss": 0.0621, "num_tokens": 1462804486.0, "reward": 2.4447546005249023, "reward_std": 0.4166436493396759, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12783296406269073, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 997.888427734375, "completions/mean_terminated_length": 762.6174926757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5546854189973895, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13765768979067447, "kl": 0.027313232421875, "learning_rate": 5.431375636151833e-07, "loss": 0.1218, "num_tokens": 1463319332.0, "reward": 2.3426339626312256, "reward_std": 0.5067855715751648, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16460275650024414, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1075.1138916015625, "completions/mean_terminated_length": 847.3030395507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5548985136646956, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12972409661348047, "kl": 0.025390625, "learning_rate": 5.428028385028572e-07, "loss": 0.0592, "num_tokens": 1463871143.0, "reward": 2.2767858505249023, "reward_std": 0.4311223328113556, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.4834881126880646, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1378791630268097, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1051.0357666015625, "completions/mean_terminated_length": 817.5867919921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5551116083320015, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12029987376112536, "kl": 0.02606201171875, "learning_rate": 5.42468117373609e-07, "loss": 0.0492, "num_tokens": 1464405447.0, "reward": 2.4760046005249023, "reward_std": 0.4058417081832886, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1046096533536911, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 874.2410888671875, "completions/mean_terminated_length": 720.111083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5553247029993075, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.18953896596032319, "kl": 0.03857421875, "learning_rate": 5.421334004126814e-07, "loss": 0.0674, "num_tokens": 1464873171.0, "reward": 2.4324777126312256, "reward_std": 0.4040633738040924, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12015077471733093, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 857.2098388671875, "completions/mean_terminated_length": 676.6015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5555377976666134, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14002389938369492, "kl": 0.0338134765625, "learning_rate": 5.417986878053145e-07, "loss": 0.072, "num_tokens": 1465323761.0, "reward": 2.4732143878936768, "reward_std": 0.4370638132095337, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 906.6629638671875, "completions/mean_terminated_length": 746.933837890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5557508923339194, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13729975978229228, "kl": 0.033843994140625, "learning_rate": 5.414639797367462e-07, "loss": 0.0774, "num_tokens": 1465799994.0, "reward": 2.46484375, "reward_std": 0.38406994938850403, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1273927539587021, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1026.9085693359375, "completions/mean_terminated_length": 741.0028686523438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5559639870012253, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12521871723959077, "kl": 0.026275634765625, "learning_rate": 5.411292763922115e-07, "loss": 0.015, "num_tokens": 1466340705.0, "reward": 2.3878350257873535, "reward_std": 0.35273706912994385, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11214316636323929, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 883.1763916015625, "completions/mean_terminated_length": 749.8880615234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5561770816685312, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13786148696566777, "kl": 0.030609130859375, "learning_rate": 5.407945779569437e-07, "loss": 0.0956, "num_tokens": 1466805696.0, "reward": 2.4034600257873535, "reward_std": 0.36062392592430115, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1070.52685546875, "completions/mean_terminated_length": 803.9431762695312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5563901763358372, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13069413153063825, "kl": 0.025604248046875, "learning_rate": 5.404598846161722e-07, "loss": 0.0844, "num_tokens": 1467355820.0, "reward": 2.2885046005249023, "reward_std": 0.3684141933917999, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11517468094825745, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1013.3660888671875, "completions/mean_terminated_length": 781.5628051757812, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5566032710031431, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15154086931701127, "kl": 0.028900146484375, "learning_rate": 5.401251965551243e-07, "loss": 0.1087, "num_tokens": 1467886624.0, "reward": 2.3214287757873535, "reward_std": 0.5204406976699829, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.8705357313156128, "rewards/format_reward/std": 0.3360883891582489, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.18038389086723328, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 886.4285888671875, "completions/mean_terminated_length": 717.0946044921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5568163656704491, "frac_reward_zero_std": 0.0, "grad_norm": 0.1384663688452525, "kl": 0.033721923828125, "learning_rate": 5.397905139590243e-07, "loss": 0.0376, "num_tokens": 1468353136.0, "reward": 2.4966518878936768, "reward_std": 0.46916860342025757, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559476852417, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 995.7723388671875, "completions/mean_terminated_length": 790.9386596679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.557029460337755, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12180172590545481, "kl": 0.02587890625, "learning_rate": 5.394558370130933e-07, "loss": 0.0596, "num_tokens": 1468871386.0, "reward": 2.3286831378936768, "reward_std": 0.3538344204425812, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174957811832428, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561822891235352, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 993.8058471679688, "completions/mean_terminated_length": 768.1111450195312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.557242555005061, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12623495953964037, "kl": 0.029510498046875, "learning_rate": 5.391211659025495e-07, "loss": 0.0694, "num_tokens": 1469386195.0, "reward": 2.4207589626312256, "reward_std": 0.42882055044174194, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12805373966693878, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 846.0982666015625, "completions/mean_terminated_length": 681.3705444335938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5574556496723669, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14415830208165778, "kl": 0.03057861328125, "learning_rate": 5.387865008126077e-07, "loss": 0.0034, "num_tokens": 1469828367.0, "reward": 2.43359375, "reward_std": 0.4682982563972473, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14052370190620422, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1030.9598388671875, "completions/mean_terminated_length": 799.6876831054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.557668744339673, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12576256610199696, "kl": 0.029083251953125, "learning_rate": 5.384518419284791e-07, "loss": 0.0854, "num_tokens": 1470358717.0, "reward": 2.3588171005249023, "reward_std": 0.4639336168766022, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16087746620178223, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 922.872802734375, "completions/mean_terminated_length": 787.8574829101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5578818390069789, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15170371406206454, "kl": 0.028228759765625, "learning_rate": 5.381171894353725e-07, "loss": 0.0855, "num_tokens": 1470842164.0, "reward": 2.3738839626312256, "reward_std": 0.423240065574646, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14311718940734863, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 987.9531860351562, "completions/mean_terminated_length": 791.6481323242188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5580949336742848, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13135331044535922, "kl": 0.029388427734375, "learning_rate": 5.377825435184915e-07, "loss": 0.094, "num_tokens": 1471353263.0, "reward": 2.46484375, "reward_std": 0.4773964285850525, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16172881424427032, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 963.669677734375, "completions/mean_terminated_length": 779.6448974609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5583080283415908, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12757267422164323, "kl": 0.02850341796875, "learning_rate": 5.37447904363038e-07, "loss": 0.085, "num_tokens": 1471850395.0, "reward": 2.349888563156128, "reward_std": 0.4307153820991516, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15628795325756073, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 951.2031860351562, "completions/mean_terminated_length": 761.7042236328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5585211230088967, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12403225478166431, "kl": 0.027679443359375, "learning_rate": 5.371132721542084e-07, "loss": 0.0562, "num_tokens": 1472343990.0, "reward": 2.4441964626312256, "reward_std": 0.41627421975135803, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13376134634017944, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 1004.5535888671875, "completions/mean_terminated_length": 774.256103515625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5587342176762027, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14373088519809513, "kl": 0.027008056640625, "learning_rate": 5.367786470771962e-07, "loss": 0.0864, "num_tokens": 1472863694.0, "reward": 2.3370537757873535, "reward_std": 0.4103930592536926, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 984.7388916015625, "completions/mean_terminated_length": 797.7611694335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5589473123435086, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11361073484305183, "kl": 0.027099609375, "learning_rate": 5.364440293171909e-07, "loss": 0.0689, "num_tokens": 1473373945.0, "reward": 2.4799108505249023, "reward_std": 0.399619460105896, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11244472116231918, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 981.6138916015625, "completions/mean_terminated_length": 835.4593505859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5591604070108146, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12396197335137026, "kl": 0.027862548828125, "learning_rate": 5.361094190593777e-07, "loss": 0.0614, "num_tokens": 1473882316.0, "reward": 2.459263563156128, "reward_std": 0.47173482179641724, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 968.3370971679688, "completions/mean_terminated_length": 758.1626586914062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5593735016781205, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12625989178642208, "kl": 0.028839111328125, "learning_rate": 5.35774816488938e-07, "loss": 0.0839, "num_tokens": 1474378611.0, "reward": 2.4441964626312256, "reward_std": 0.4054144322872162, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13731664419174194, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 979.9732666015625, "completions/mean_terminated_length": 718.9000244140625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.5595865963454264, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12840186531315384, "kl": 0.027801513671875, "learning_rate": 5.354402217910483e-07, "loss": 0.0974, "num_tokens": 1474887527.0, "reward": 2.4419643878936768, "reward_std": 0.40829336643218994, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11244472116231918, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 939.3973388671875, "completions/mean_terminated_length": 767.9638671875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5597996910127324, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.6444096761549296, "kl": 0.031646728515625, "learning_rate": 5.351056351508816e-07, "loss": 0.0831, "num_tokens": 1475382601.0, "reward": 2.388951063156128, "reward_std": 0.46781638264656067, "rewards/accuracy_reward/mean": 0.5324074029922485, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.146371990442276, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1135.79248046875, "completions/mean_terminated_length": 893.5678100585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5600127856800383, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12710418831407644, "kl": 0.023223876953125, "learning_rate": 5.347710567536057e-07, "loss": 0.0808, "num_tokens": 1475965836.0, "reward": 2.3582589626312256, "reward_std": 0.42983826994895935, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16906259953975677, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 963.94873046875, "completions/mean_terminated_length": 710.107421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5602258803473443, "frac_reward_zero_std": 0.25, "grad_norm": 0.13288478611508414, "kl": 0.028656005859375, "learning_rate": 5.344364867843841e-07, "loss": 0.0931, "num_tokens": 1476463781.0, "reward": 2.3671875, "reward_std": 0.3562367558479309, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14969991147518158, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1043.43310546875, "completions/mean_terminated_length": 814.9972534179688, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5604389750146502, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11385585983768896, "kl": 0.026763916015625, "learning_rate": 5.34101925428376e-07, "loss": 0.0431, "num_tokens": 1476998135.0, "reward": 2.357701063156128, "reward_std": 0.331063449382782, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13748814165592194, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 989.1920166015625, "completions/mean_terminated_length": 783.0773315429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5606520696819562, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12358929205034586, "kl": 0.0283203125, "learning_rate": 5.33767372870735e-07, "loss": 0.0443, "num_tokens": 1477515053.0, "reward": 2.5027902126312256, "reward_std": 0.4237518608570099, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15314915776252747, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1027.5625, "completions/mean_terminated_length": 802.3433227539062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5608651643492621, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11991277810461956, "kl": 0.027099609375, "learning_rate": 5.334328292966108e-07, "loss": 0.0976, "num_tokens": 1478053193.0, "reward": 2.474888563156128, "reward_std": 0.38539546728134155, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.216333270072937, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12015077471733093, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1137.546875, "completions/mean_terminated_length": 882.6199951171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5610782590165682, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12505912323811832, "kl": 0.0263671875, "learning_rate": 5.330982948911475e-07, "loss": 0.0945, "num_tokens": 1478637982.0, "reward": 2.2918527126312256, "reward_std": 0.4079464077949524, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1100.790283203125, "completions/mean_terminated_length": 856.005615234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5612913536838741, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1154762700102432, "kl": 0.0245361328125, "learning_rate": 5.327637698394842e-07, "loss": 0.0654, "num_tokens": 1479203792.0, "reward": 2.404576063156128, "reward_std": 0.451653391122818, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.1657317876815796, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 991.700927734375, "completions/mean_terminated_length": 782.7005615234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.56150444835118, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.134487181394703, "kl": 0.030548095703125, "learning_rate": 5.32429254326755e-07, "loss": 0.0707, "num_tokens": 1479717194.0, "reward": 2.48046875, "reward_std": 0.425826758146286, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1120.5379638671875, "completions/mean_terminated_length": 893.8250122070312, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.561717543018486, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11188389292700854, "kl": 0.0257568359375, "learning_rate": 5.320947485380883e-07, "loss": 0.0675, "num_tokens": 1480289723.0, "reward": 2.2935268878936768, "reward_std": 0.4769846200942993, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17032796144485474, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 936.02685546875, "completions/mean_terminated_length": 764.0721435546875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5619306376857919, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.28855193647952737, "kl": 0.032989501953125, "learning_rate": 5.317602526586082e-07, "loss": 0.0892, "num_tokens": 1480780599.0, "reward": 2.5184152126312256, "reward_std": 0.47805073857307434, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1524137556552887, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1002.7076416015625, "completions/mean_terminated_length": 792.5281982421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5621437323530979, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13759937677542167, "kl": 0.02496337890625, "learning_rate": 5.314257668734318e-07, "loss": 0.0781, "num_tokens": 1481304612.0, "reward": 2.431361675262451, "reward_std": 0.33426597714424133, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1327671855688095, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1036.946533203125, "completions/mean_terminated_length": 813.79833984375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5623568270204038, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12781642366808002, "kl": 0.02801513671875, "learning_rate": 5.310912913676721e-07, "loss": 0.0691, "num_tokens": 1481838332.0, "reward": 2.353794813156128, "reward_std": 0.4601910412311554, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14259286224842072, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 858.4330444335938, "completions/mean_terminated_length": 698.8202514648438, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5625699216877098, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1435897039478035, "kl": 0.0328369140625, "learning_rate": 5.307568263264349e-07, "loss": 0.0824, "num_tokens": 1482283166.0, "reward": 2.4732143878936768, "reward_std": 0.41550031304359436, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1012.9732666015625, "completions/mean_terminated_length": 811.4879760742188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5627830163550157, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13128086621221471, "kl": 0.02618408203125, "learning_rate": 5.304223719348215e-07, "loss": 0.0897, "num_tokens": 1482803442.0, "reward": 2.450892925262451, "reward_std": 0.39558514952659607, "rewards/accuracy_reward/mean": 0.5532407164573669, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13422717154026031, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1149.013427734375, "completions/mean_terminated_length": 834.9096069335938, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.5629961110223217, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13171506506147648, "kl": 0.0260009765625, "learning_rate": 5.300879283779268e-07, "loss": 0.1374, "num_tokens": 1483385144.0, "reward": 2.3119421005249023, "reward_std": 0.4740091860294342, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16142748296260834, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 918.6317138671875, "completions/mean_terminated_length": 720.0288696289062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5632092056896276, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11980488117574017, "kl": 0.028472900390625, "learning_rate": 5.297534958408394e-07, "loss": 0.0837, "num_tokens": 1483860579.0, "reward": 2.3643975257873535, "reward_std": 0.4041427671909332, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.13000212609767914, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1109.872802734375, "completions/mean_terminated_length": 847.1971435546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5634223003569335, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10327414510349373, "kl": 0.025238037109375, "learning_rate": 5.294190745086426e-07, "loss": 0.0299, "num_tokens": 1484417754.0, "reward": 2.255580425262451, "reward_std": 0.4205310046672821, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1599874496459961, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1009.47998046875, "completions/mean_terminated_length": 773.3233032226562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5636353950242395, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11110738201786279, "kl": 0.028289794921875, "learning_rate": 5.290846645664125e-07, "loss": 0.0707, "num_tokens": 1484932257.0, "reward": 2.484375, "reward_std": 0.373627632856369, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14480386674404144, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 971.9085083007812, "completions/mean_terminated_length": 799.0647583007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5638484896915454, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12044673954833522, "kl": 0.027191162109375, "learning_rate": 5.287502661992197e-07, "loss": 0.0644, "num_tokens": 1485439784.0, "reward": 2.5362725257873535, "reward_std": 0.42643916606903076, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10724954307079315, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1066.075927734375, "completions/mean_terminated_length": 765.4869384765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5640615843588515, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13605337010110938, "kl": 0.02734375, "learning_rate": 5.284158795921281e-07, "loss": 0.0833, "num_tokens": 1485984090.0, "reward": 2.2611608505249023, "reward_std": 0.36337438225746155, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1045.3504638671875, "completions/mean_terminated_length": 800.2583618164062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5642746790261574, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1342160202590979, "kl": 0.028778076171875, "learning_rate": 5.280815049301949e-07, "loss": 0.0668, "num_tokens": 1486524439.0, "reward": 2.4174108505249023, "reward_std": 0.5006897449493408, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14903545379638672, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 991.5558471679688, "completions/mean_terminated_length": 818.68310546875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5644877736934634, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11171373908048758, "kl": 0.026885986328125, "learning_rate": 5.277471423984709e-07, "loss": 0.0426, "num_tokens": 1487044176.0, "reward": 2.3828125, "reward_std": 0.3635784089565277, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.1118156760931015, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1209.109375, "completions/mean_terminated_length": 895.168701171875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5647008683607693, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1228380250720558, "kl": 0.023193359375, "learning_rate": 5.274127921820004e-07, "loss": 0.0854, "num_tokens": 1487659137.0, "reward": 2.2779018878936768, "reward_std": 0.48724567890167236, "rewards/accuracy_reward/mean": 0.41898149251937866, "rewards/accuracy_reward/std": 0.49396437406539917, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16289502382278442, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 895.294677734375, "completions/mean_terminated_length": 696.1361083984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5649139630280752, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13537411244193284, "kl": 0.029296875, "learning_rate": 5.270784544658207e-07, "loss": 0.0844, "num_tokens": 1488129525.0, "reward": 2.396205425262451, "reward_std": 0.37333884835243225, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11468179523944855, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1062.8504638671875, "completions/mean_terminated_length": 845.4196166992188, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5651270576953812, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12137100123020736, "kl": 0.0257568359375, "learning_rate": 5.267441294349619e-07, "loss": 0.0601, "num_tokens": 1488673474.0, "reward": 2.431361675262451, "reward_std": 0.4382311701774597, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13170984387397766, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1058.85498046875, "completions/mean_terminated_length": 820.4736328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5653401523626871, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.122792074999492, "kl": 0.023193359375, "learning_rate": 5.264098172744471e-07, "loss": 0.0966, "num_tokens": 1489218801.0, "reward": 2.33984375, "reward_std": 0.41300156712532043, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15291257202625275, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 901.3638916015625, "completions/mean_terminated_length": 685.4190673828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5655532470299931, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.16285266573177537, "kl": 0.031463623046875, "learning_rate": 5.26075518169293e-07, "loss": 0.0759, "num_tokens": 1489691300.0, "reward": 2.431361675262451, "reward_std": 0.41083529591560364, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1017.69873046875, "completions/mean_terminated_length": 845.9818115234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.565766341697299, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15405397292111547, "kl": 0.02685546875, "learning_rate": 5.257412323045081e-07, "loss": 0.0605, "num_tokens": 1490225421.0, "reward": 2.5323662757873535, "reward_std": 0.45641008019447327, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12956565618515015, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1095.587158203125, "completions/mean_terminated_length": 842.6864624023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.565979436364605, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11830537465485597, "kl": 0.02398681640625, "learning_rate": 5.254069598650947e-07, "loss": 0.0805, "num_tokens": 1490781476.0, "reward": 2.3364956378936768, "reward_std": 0.38298436999320984, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14517304301261902, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 933.4464721679688, "completions/mean_terminated_length": 757.7674560546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5661925310319109, "frac_reward_zero_std": 0.0, "grad_norm": 0.13954503308964486, "kl": 0.028961181640625, "learning_rate": 5.250727010360463e-07, "loss": 0.1245, "num_tokens": 1491266412.0, "reward": 2.494419813156128, "reward_std": 0.4833642542362213, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585100531578064, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1095.9710693359375, "completions/mean_terminated_length": 876.27197265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5664056256992169, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12008218210858618, "kl": 0.025604248046875, "learning_rate": 5.247384560023498e-07, "loss": 0.0799, "num_tokens": 1491835007.0, "reward": 2.2840402126312256, "reward_std": 0.4294620454311371, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.13851940631866455, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1010.6406860351562, "completions/mean_terminated_length": 818.5369873046875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5666187203665228, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13290692354104014, "kl": 0.027862548828125, "learning_rate": 5.244042249489844e-07, "loss": 0.1051, "num_tokens": 1492352286.0, "reward": 2.3364956378936768, "reward_std": 0.41447001695632935, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780035495758057, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 890.2545166015625, "completions/mean_terminated_length": 738.227294921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5668318150338287, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.20374180216549947, "kl": 0.030120849609375, "learning_rate": 5.240700080609212e-07, "loss": 0.0196, "num_tokens": 1492831584.0, "reward": 2.4073662757873535, "reward_std": 0.38919177651405334, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1334015429019928, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1008.1027221679688, "completions/mean_terminated_length": 792.27490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5670449097011347, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10491118610204067, "kl": 0.026336669921875, "learning_rate": 5.237358055231238e-07, "loss": 0.019, "num_tokens": 1493354734.0, "reward": 2.4425225257873535, "reward_std": 0.3357571065425873, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.09263478219509125, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 924.6563110351562, "completions/mean_terminated_length": 760.8951416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5672580043684406, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13595540863504654, "kl": 0.02752685546875, "learning_rate": 5.234016175205477e-07, "loss": 0.058, "num_tokens": 1493836452.0, "reward": 2.51171875, "reward_std": 0.4025529623031616, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 993.2232666015625, "completions/mean_terminated_length": 787.893310546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5674710990357467, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1372326715577458, "kl": 0.02716064453125, "learning_rate": 5.230674442381405e-07, "loss": 0.0613, "num_tokens": 1494354856.0, "reward": 2.4029018878936768, "reward_std": 0.45049983263015747, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 931.3125610351562, "completions/mean_terminated_length": 771.7857055664062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5676841937030526, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12982180455272763, "kl": 0.029327392578125, "learning_rate": 5.227332858608413e-07, "loss": 0.0851, "num_tokens": 1494843188.0, "reward": 2.4888393878936768, "reward_std": 0.3818797469139099, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09768717736005783, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 914.4754638671875, "completions/mean_terminated_length": 701.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5678972883703586, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13302626754717684, "kl": 0.029632568359375, "learning_rate": 5.223991425735812e-07, "loss": 0.0859, "num_tokens": 1495320793.0, "reward": 2.4441964626312256, "reward_std": 0.39099055528640747, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12155693024396896, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 845.4777221679688, "completions/mean_terminated_length": 687.5706787109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5681103830376645, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13556864439769312, "kl": 0.031341552734375, "learning_rate": 5.220650145612832e-07, "loss": 0.0788, "num_tokens": 1495759631.0, "reward": 2.5106027126312256, "reward_std": 0.3736709952354431, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1117081567645073, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1170.1317138671875, "completions/mean_terminated_length": 891.2794189453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5683234777049704, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12414931320693118, "kl": 0.022705078125, "learning_rate": 5.217309020088611e-07, "loss": 0.0774, "num_tokens": 1496351306.0, "reward": 2.275669813156128, "reward_std": 0.4311106204986572, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.18299148976802826, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1125.200927734375, "completions/mean_terminated_length": 853.1618041992188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5685365723722764, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12326672749206961, "kl": 0.02325439453125, "learning_rate": 5.213968051012212e-07, "loss": 0.0622, "num_tokens": 1496929204.0, "reward": 2.3404018878936768, "reward_std": 0.48235875368118286, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18239015340805054, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1079.29248046875, "completions/mean_terminated_length": 896.8567504882812, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5687496670395823, "frac_reward_zero_std": 0.0, "grad_norm": 0.12728705641042465, "kl": 0.025177001953125, "learning_rate": 5.210627240232603e-07, "loss": 0.1041, "num_tokens": 1497482951.0, "reward": 2.390625, "reward_std": 0.5176438093185425, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14942027628421783, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1069.8638916015625, "completions/mean_terminated_length": 781.5115356445312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5689627617068883, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13078165031966787, "kl": 0.02484130859375, "learning_rate": 5.207286589598666e-07, "loss": 0.0806, "num_tokens": 1498035338.0, "reward": 2.3565850257873535, "reward_std": 0.46218574047088623, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15448831021785736, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1009.0692138671875, "completions/mean_terminated_length": 800.1689453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5691758563741942, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12732452105413625, "kl": 0.026824951171875, "learning_rate": 5.203946100959197e-07, "loss": 0.0688, "num_tokens": 1498555321.0, "reward": 2.3984375, "reward_std": 0.39427533745765686, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1251746416091919, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1025.805908203125, "completions/mean_terminated_length": 826.8186645507812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5693889510415002, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11646350761656792, "kl": 0.025634765625, "learning_rate": 5.200605776162898e-07, "loss": 0.05, "num_tokens": 1499077314.0, "reward": 2.4112725257873535, "reward_std": 0.37816962599754333, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09811913222074509, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1059.872802734375, "completions/mean_terminated_length": 841.7847290039062, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5696020457088061, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12594720320446293, "kl": 0.0244140625, "learning_rate": 5.197265617058389e-07, "loss": 0.0404, "num_tokens": 1499620281.0, "reward": 2.467076063156128, "reward_std": 0.3397904932498932, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10366258770227432, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1003.9620971679688, "completions/mean_terminated_length": 763.0302124023438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5698151403761121, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13434399127226226, "kl": 0.027496337890625, "learning_rate": 5.193925625494185e-07, "loss": 0.1021, "num_tokens": 1500141208.0, "reward": 2.349330425262451, "reward_std": 0.4278067946434021, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 982.6942138671875, "completions/mean_terminated_length": 833.6055908203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.570028235043418, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14180966152504046, "kl": 0.02752685546875, "learning_rate": 5.190585803318721e-07, "loss": 0.095, "num_tokens": 1500656063.0, "reward": 2.400669813156128, "reward_std": 0.492755651473999, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399912416934967, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1058.9866943359375, "completions/mean_terminated_length": 850.4918823242188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5702413297107239, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14256061389495808, "kl": 0.03021240234375, "learning_rate": 5.187246152380331e-07, "loss": 0.0885, "num_tokens": 1501198153.0, "reward": 2.2583706378936768, "reward_std": 0.3884522318840027, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 906.0178833007812, "completions/mean_terminated_length": 759.3148193359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.57045442437803, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1448747971532215, "kl": 0.03021240234375, "learning_rate": 5.183906674527256e-07, "loss": 0.068, "num_tokens": 1501680561.0, "reward": 2.40625, "reward_std": 0.4149862825870514, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13058778643608093, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 958.1719360351562, "completions/mean_terminated_length": 717.6375732421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5706675190453359, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13589082611372647, "kl": 0.030731201171875, "learning_rate": 5.180567371607641e-07, "loss": 0.0673, "num_tokens": 1502188846.0, "reward": 2.4916296005249023, "reward_std": 0.46413901448249817, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1477556824684143, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1067.9285888671875, "completions/mean_terminated_length": 845.0630493164062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5708806137126419, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12868667996485622, "kl": 0.0279541015625, "learning_rate": 5.177228245469537e-07, "loss": 0.0583, "num_tokens": 1502737710.0, "reward": 2.3325893878936768, "reward_std": 0.419612318277359, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14619383215904236, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 959.2545166015625, "completions/mean_terminated_length": 764.4263305664062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5710937083799478, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14187563472226494, "kl": 0.029876708984375, "learning_rate": 5.173889297960893e-07, "loss": 0.0961, "num_tokens": 1503233248.0, "reward": 2.4135046005249023, "reward_std": 0.46477067470550537, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1590748131275177, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1069.8973388671875, "completions/mean_terminated_length": 747.73291015625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5713068030472538, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12958603274778882, "kl": 0.024810791015625, "learning_rate": 5.170550530929561e-07, "loss": 0.0662, "num_tokens": 1503782290.0, "reward": 2.2455358505249023, "reward_std": 0.38819533586502075, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16025643050670624, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1049.2232666015625, "completions/mean_terminated_length": 879.718017578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5715198977145597, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11571750499002063, "kl": 0.02734375, "learning_rate": 5.167211946223292e-07, "loss": 0.0853, "num_tokens": 1504319478.0, "reward": 2.3091518878936768, "reward_std": 0.46825939416885376, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14213687181472778, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 914.591552734375, "completions/mean_terminated_length": 742.6864013671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5717329923818656, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14757016144715496, "kl": 0.031219482421875, "learning_rate": 5.163873545689739e-07, "loss": 0.1482, "num_tokens": 1504806719.0, "reward": 2.489955425262451, "reward_std": 0.4557664394378662, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1194591298699379, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 901.0357666015625, "completions/mean_terminated_length": 706.3812255859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5719460870491716, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1361870645356449, "kl": 0.028472900390625, "learning_rate": 5.160535331176449e-07, "loss": 0.0767, "num_tokens": 1505278543.0, "reward": 2.4933037757873535, "reward_std": 0.4082051217556, "rewards/accuracy_reward/mean": 0.5810185074806213, "rewards/accuracy_reward/std": 0.49396437406539917, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493200302124, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12425874173641205, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1065.78125, "completions/mean_terminated_length": 858.7189331054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5721591817164775, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12172322553595843, "kl": 0.0263671875, "learning_rate": 5.157197304530869e-07, "loss": 0.0848, "num_tokens": 1505831997.0, "reward": 2.3013393878936768, "reward_std": 0.4286647140979767, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 866.5313110351562, "completions/mean_terminated_length": 718.1055297851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5723722763837835, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13484552029380745, "kl": 0.03466796875, "learning_rate": 5.153859467600342e-07, "loss": 0.0419, "num_tokens": 1506292683.0, "reward": 2.4341518878936768, "reward_std": 0.416586309671402, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 994.5558471679688, "completions/mean_terminated_length": 779.3359985351562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5725853710510894, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11767949410187596, "kl": 0.028900146484375, "learning_rate": 5.150521822232106e-07, "loss": 0.1094, "num_tokens": 1506810356.0, "reward": 2.4375, "reward_std": 0.44715550541877747, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 992.4241333007812, "completions/mean_terminated_length": 780.1769409179688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.5727984657183954, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1411065824735259, "kl": 0.0284423828125, "learning_rate": 5.147184370273292e-07, "loss": 0.0787, "num_tokens": 1507322946.0, "reward": 2.5027902126312256, "reward_std": 0.44862544536590576, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13700605928897858, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1024.41748046875, "completions/mean_terminated_length": 774.2083740234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5730115603857013, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11972005072785544, "kl": 0.02655029296875, "learning_rate": 5.143847113570921e-07, "loss": 0.0401, "num_tokens": 1507858749.0, "reward": 2.3683037757873535, "reward_std": 0.40954843163490295, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13106490671634674, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1116.5848388671875, "completions/mean_terminated_length": 852.3724975585938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5732246550530073, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1171880841106862, "kl": 0.024078369140625, "learning_rate": 5.140510053971912e-07, "loss": 0.1078, "num_tokens": 1508429763.0, "reward": 2.345982313156128, "reward_std": 0.4725840091705322, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.18130891025066376, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1084.0023193359375, "completions/mean_terminated_length": 817.5982666015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5734377497203133, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13899350451442757, "kl": 0.025543212890625, "learning_rate": 5.137173193323071e-07, "loss": 0.1189, "num_tokens": 1508990196.0, "reward": 2.2527902126312256, "reward_std": 0.5071700811386108, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.16790206730365753, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1168.40625, "completions/mean_terminated_length": 871.7074584960938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5736508443876192, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12812073413287842, "kl": 0.022735595703125, "learning_rate": 5.133836533471098e-07, "loss": 0.0359, "num_tokens": 1509589002.0, "reward": 2.239955425262451, "reward_std": 0.42624154686927795, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559327840805, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 949.69873046875, "completions/mean_terminated_length": 776.5814208984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5738639390549252, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13048153114447855, "kl": 0.0302734375, "learning_rate": 5.130500076262575e-07, "loss": 0.1147, "num_tokens": 1510080451.0, "reward": 2.310267925262451, "reward_std": 0.43063560128211975, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15035313367843628, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 846.4063110351562, "completions/mean_terminated_length": 674.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5740770337222311, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.157715354974867, "kl": 0.034637451171875, "learning_rate": 5.127163823543975e-07, "loss": 0.0597, "num_tokens": 1510528537.0, "reward": 2.443080425262451, "reward_std": 0.40925729274749756, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14114972949028015, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 904.0781860351562, "completions/mean_terminated_length": 757.1259155273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5742901283895371, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1231995259338818, "kl": 0.028564453125, "learning_rate": 5.123827777161662e-07, "loss": 0.0303, "num_tokens": 1511006892.0, "reward": 2.5502233505249023, "reward_std": 0.36223557591438293, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 963.622802734375, "completions/mean_terminated_length": 759.4031982421875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.574503223056843, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13287240852693366, "kl": 0.0286865234375, "learning_rate": 5.12049193896188e-07, "loss": 0.12, "num_tokens": 1511509299.0, "reward": 2.474330425262451, "reward_std": 0.45131295919418335, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15610112249851227, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 974.7678833007812, "completions/mean_terminated_length": 769.2553100585938, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.574716317724149, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13090804531145267, "kl": 0.029083251953125, "learning_rate": 5.117156310790762e-07, "loss": 0.0788, "num_tokens": 1512014219.0, "reward": 2.3956475257873535, "reward_std": 0.5151432156562805, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16028662025928497, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 863.2344360351562, "completions/mean_terminated_length": 740.6724243164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5749294123914549, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12511088722009267, "kl": 0.03094482421875, "learning_rate": 5.113820894494324e-07, "loss": 0.0635, "num_tokens": 1512477364.0, "reward": 2.4949777126312256, "reward_std": 0.38574710488319397, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12086557596921921, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 958.32373046875, "completions/mean_terminated_length": 763.3289794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5751425070587608, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14338422206753407, "kl": 0.0283203125, "learning_rate": 5.110485691918458e-07, "loss": 0.0809, "num_tokens": 1512977445.0, "reward": 2.404017925262451, "reward_std": 0.3946557939052582, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12223286926746368, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 889.0870971679688, "completions/mean_terminated_length": 726.898193359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5753556017260668, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15457543743335214, "kl": 0.030242919921875, "learning_rate": 5.107150704908948e-07, "loss": 0.1501, "num_tokens": 1513439692.0, "reward": 2.478794813156128, "reward_std": 0.4300190508365631, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1502326875925064, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 992.5491333007812, "completions/mean_terminated_length": 790.4414672851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5755686963933727, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11210175375752965, "kl": 0.026458740234375, "learning_rate": 5.103815935311452e-07, "loss": 0.0593, "num_tokens": 1513952002.0, "reward": 2.364955425262451, "reward_std": 0.4000357389450073, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14649668335914612, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 966.3170166015625, "completions/mean_terminated_length": 752.2941284179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5757817910606787, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.7458028801964932, "kl": 0.034698486328125, "learning_rate": 5.100481384971511e-07, "loss": 0.0885, "num_tokens": 1514468112.0, "reward": 2.39453125, "reward_std": 0.3782336413860321, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15968577563762665, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 859.904052734375, "completions/mean_terminated_length": 740.2186889648438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5759948857279846, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15498907528530825, "kl": 0.029541015625, "learning_rate": 5.097147055734543e-07, "loss": 0.0647, "num_tokens": 1514923989.0, "reward": 2.5, "reward_std": 0.4385365843772888, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14672231674194336, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 973.0156860351562, "completions/mean_terminated_length": 790.5770263671875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5762079803952906, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1496255367495632, "kl": 0.03009033203125, "learning_rate": 5.093812949445844e-07, "loss": 0.0722, "num_tokens": 1515424956.0, "reward": 2.3627233505249023, "reward_std": 0.435127317905426, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1576608121395111, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 900.1295166015625, "completions/mean_terminated_length": 722.6237182617188, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5764210750625965, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1323522606038315, "kl": 0.02960205078125, "learning_rate": 5.090479067950587e-07, "loss": 0.0835, "num_tokens": 1515897350.0, "reward": 2.5513393878936768, "reward_std": 0.4319068491458893, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14238695800304413, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 997.2991333007812, "completions/mean_terminated_length": 765.4005126953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5766341697299026, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1320313717902181, "kl": 0.025604248046875, "learning_rate": 5.087145413093818e-07, "loss": 0.0808, "num_tokens": 1516418588.0, "reward": 2.4581475257873535, "reward_std": 0.441916823387146, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 909.66748046875, "completions/mean_terminated_length": 730.2403564453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.5768472643972085, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15513858688235604, "kl": 0.0338134765625, "learning_rate": 5.083811986720463e-07, "loss": 0.0208, "num_tokens": 1516895303.0, "reward": 2.5050225257873535, "reward_std": 0.411132276058197, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12450840324163437, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 978.3013916015625, "completions/mean_terminated_length": 763.2144775390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5770603590645144, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13063435725615272, "kl": 0.030303955078125, "learning_rate": 5.080478790675316e-07, "loss": 0.0956, "num_tokens": 1517403422.0, "reward": 2.453125, "reward_std": 0.3992384970188141, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 957.94873046875, "completions/mean_terminated_length": 749.2153930664062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5772734537318204, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14200426956042472, "kl": 0.03082275390625, "learning_rate": 5.077145826803048e-07, "loss": 0.0819, "num_tokens": 1517908183.0, "reward": 2.5223214626312256, "reward_std": 0.4615324139595032, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764310240745544, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16317449510097504, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1007.6428833007812, "completions/mean_terminated_length": 798.455810546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5774865483991263, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11509399639090413, "kl": 0.025054931640625, "learning_rate": 5.073813096948197e-07, "loss": 0.0749, "num_tokens": 1518426423.0, "reward": 2.3956475257873535, "reward_std": 0.47592949867248535, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16767141222953796, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1022.29248046875, "completions/mean_terminated_length": 768.0083618164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5776996430664323, "frac_reward_zero_std": 0.0, "grad_norm": 0.13347564688702754, "kl": 0.0257568359375, "learning_rate": 5.070480602955175e-07, "loss": 0.0419, "num_tokens": 1518952522.0, "reward": 2.3521206378936768, "reward_std": 0.4461229145526886, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13254131376743317, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 911.1138916015625, "completions/mean_terminated_length": 752.0076293945312, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5779127377337382, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11090236767233502, "kl": 0.0269775390625, "learning_rate": 5.067148346668263e-07, "loss": 0.0035, "num_tokens": 1519430637.0, "reward": 2.540736675262451, "reward_std": 0.3817024528980255, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12673446536064148, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 924.3348388671875, "completions/mean_terminated_length": 770.3299560546875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5781258324010442, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13829030450082372, "kl": 0.030181884765625, "learning_rate": 5.063816329931609e-07, "loss": 0.079, "num_tokens": 1519918515.0, "reward": 2.3431921005249023, "reward_std": 0.43478500843048096, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13776934146881104, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 885.0022583007812, "completions/mean_terminated_length": 755.1389770507812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5783389270683501, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12903955672837428, "kl": 0.0311279296875, "learning_rate": 5.060484554589229e-07, "loss": 0.0727, "num_tokens": 1520378324.0, "reward": 2.5128350257873535, "reward_std": 0.4313536584377289, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1024.83935546875, "completions/mean_terminated_length": 767.6201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5785520217356561, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13420294520411027, "kl": 0.02911376953125, "learning_rate": 5.057153022485005e-07, "loss": 0.0589, "num_tokens": 1520907660.0, "reward": 2.361049175262451, "reward_std": 0.4494055509567261, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.1634640097618103, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1060.790283203125, "completions/mean_terminated_length": 849.4363403320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.578765116402962, "frac_reward_zero_std": 0.0, "grad_norm": 0.12841310869216124, "kl": 0.0244140625, "learning_rate": 5.053821735462689e-07, "loss": 0.0964, "num_tokens": 1521456638.0, "reward": 2.3872768878936768, "reward_std": 0.45457085967063904, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13713014125823975, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 869.372802734375, "completions/mean_terminated_length": 697.5524291992188, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5789782110702679, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13006710909228478, "kl": 0.0323486328125, "learning_rate": 5.050490695365889e-07, "loss": 0.0311, "num_tokens": 1521919797.0, "reward": 2.428013563156128, "reward_std": 0.4147588014602661, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1293378323316574, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1043.5379638671875, "completions/mean_terminated_length": 804.9088745117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5791913057375739, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12511495016467727, "kl": 0.027557373046875, "learning_rate": 5.047159904038081e-07, "loss": 0.0681, "num_tokens": 1522455862.0, "reward": 2.2840402126312256, "reward_std": 0.4744742512702942, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17796529829502106, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1008.7277221679688, "completions/mean_terminated_length": 761.8287353515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5794044004048798, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12937673598406707, "kl": 0.02630615234375, "learning_rate": 5.043829363322605e-07, "loss": 0.0728, "num_tokens": 1522987740.0, "reward": 2.4497768878936768, "reward_std": 0.4099780023097992, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1039.9732666015625, "completions/mean_terminated_length": 817.4931640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5796174950721859, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1277239726023439, "kl": 0.026458740234375, "learning_rate": 5.040499075062658e-07, "loss": 0.103, "num_tokens": 1523522032.0, "reward": 2.2963171005249023, "reward_std": 0.4322567880153656, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12884463369846344, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 981.1094360351562, "completions/mean_terminated_length": 790.192138671875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5798305897394918, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12344985616488997, "kl": 0.02581787109375, "learning_rate": 5.037169041101303e-07, "loss": 0.1146, "num_tokens": 1524027953.0, "reward": 2.3934152126312256, "reward_std": 0.488517165184021, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14003422856330872, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 919.544677734375, "completions/mean_terminated_length": 738.2901611328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5800436844067978, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13659903885833286, "kl": 0.0299072265625, "learning_rate": 5.033839263281457e-07, "loss": 0.0799, "num_tokens": 1524516197.0, "reward": 2.4347100257873535, "reward_std": 0.4202975034713745, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13566918671131134, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 954.1741333007812, "completions/mean_terminated_length": 765.1884765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5802567790741037, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15970958000916086, "kl": 0.031097412109375, "learning_rate": 5.030509743445897e-07, "loss": 0.1002, "num_tokens": 1525011987.0, "reward": 2.357142925262451, "reward_std": 0.44088101387023926, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1540280133485794, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1011.2500610351562, "completions/mean_terminated_length": 809.4293212890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5804698737414096, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12502603522582514, "kl": 0.02569580078125, "learning_rate": 5.027180483437258e-07, "loss": 0.0604, "num_tokens": 1525531315.0, "reward": 2.4419643878936768, "reward_std": 0.4300384819507599, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.13150234520435333, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1031.2723388671875, "completions/mean_terminated_length": 803.4808349609375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5806829684087156, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12000109413842157, "kl": 0.026885986328125, "learning_rate": 5.023851485098028e-07, "loss": 0.0612, "num_tokens": 1526056493.0, "reward": 2.4363839626312256, "reward_std": 0.4176003634929657, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10928615182638168, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 977.0022583007812, "completions/mean_terminated_length": 817.7256469726562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5808960630760215, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10548450722403173, "kl": 0.027557373046875, "learning_rate": 5.020522750270558e-07, "loss": 0.0538, "num_tokens": 1526562862.0, "reward": 2.5083706378936768, "reward_std": 0.3959433138370514, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15845361351966858, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1007.8482666015625, "completions/mean_terminated_length": 828.1361083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5811091577433275, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1164570763765968, "kl": 0.0264892578125, "learning_rate": 5.017194280797042e-07, "loss": 0.0745, "num_tokens": 1527085466.0, "reward": 2.462611675262451, "reward_std": 0.435996949672699, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 866.216552734375, "completions/mean_terminated_length": 704.2461547851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5813222524106334, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15686892639953445, "kl": 0.0308837890625, "learning_rate": 5.013866078519539e-07, "loss": 0.0768, "num_tokens": 1527535355.0, "reward": 2.4363839626312256, "reward_std": 0.36757248640060425, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1566121131181717, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 962.7120971679688, "completions/mean_terminated_length": 788.3911743164062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5815353470779394, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11359357169129575, "kl": 0.026458740234375, "learning_rate": 5.010538145279949e-07, "loss": 0.0209, "num_tokens": 1528038554.0, "reward": 2.4793527126312256, "reward_std": 0.3137418329715729, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1427011638879776, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1050.337158203125, "completions/mean_terminated_length": 830.1444091796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5817484417452453, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12917058934510264, "kl": 0.0240478515625, "learning_rate": 5.00721048292003e-07, "loss": 0.0802, "num_tokens": 1528585713.0, "reward": 2.4676339626312256, "reward_std": 0.440024197101593, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13606999814510345, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 891.6563110351562, "completions/mean_terminated_length": 691.869140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.5819615364125513, "frac_reward_zero_std": 0.25, "grad_norm": 0.14747479017746679, "kl": 0.031890869140625, "learning_rate": 5.00388309328139e-07, "loss": 0.1074, "num_tokens": 1529044791.0, "reward": 2.5033483505249023, "reward_std": 0.33555570244789124, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4846842288970947, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15610112249851227, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1040.180908203125, "completions/mean_terminated_length": 843.9920043945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5821746310798572, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1224465610414118, "kl": 0.025787353515625, "learning_rate": 5.000555978205483e-07, "loss": 0.0634, "num_tokens": 1529574808.0, "reward": 2.4520089626312256, "reward_std": 0.3944771885871887, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13661938905715942, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1038.857177734375, "completions/mean_terminated_length": 858.2737426757812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5823877257471631, "frac_reward_zero_std": 0.0, "grad_norm": 0.12699152942196926, "kl": 0.027374267578125, "learning_rate": 4.997229139533613e-07, "loss": 0.0358, "num_tokens": 1530111512.0, "reward": 2.5, "reward_std": 0.515058696269989, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15048591792583466, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 870.8192138671875, "completions/mean_terminated_length": 716.2398681640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5826008204144691, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12121440941360011, "kl": 0.028900146484375, "learning_rate": 4.993902579106932e-07, "loss": 0.0399, "num_tokens": 1530572759.0, "reward": 2.634486675262451, "reward_std": 0.35433393716812134, "rewards/accuracy_reward/mean": 0.7008928656578064, "rewards/accuracy_reward/std": 0.45837873220443726, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12379446625709534, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 953.1920166015625, "completions/mean_terminated_length": 770.7239990234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.582813915081775, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13344528560239113, "kl": 0.031036376953125, "learning_rate": 4.990576298766434e-07, "loss": 0.077, "num_tokens": 1531071245.0, "reward": 2.4441964626312256, "reward_std": 0.4658892750740051, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17707304656505585, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1032.5223388671875, "completions/mean_terminated_length": 825.0591430664062, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5830270097490811, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12986098916455666, "kl": 0.027984619140625, "learning_rate": 4.987250300352961e-07, "loss": 0.0291, "num_tokens": 1531602455.0, "reward": 2.376674175262451, "reward_std": 0.4132971167564392, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 985.5938110351562, "completions/mean_terminated_length": 808.5260620117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.583240104416387, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12021896563034838, "kl": 0.028900146484375, "learning_rate": 4.983924585707199e-07, "loss": 0.0995, "num_tokens": 1532118113.0, "reward": 2.4637277126312256, "reward_std": 0.4351500868797302, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15628793835639954, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 903.591552734375, "completions/mean_terminated_length": 756.5767822265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.583453199083693, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12995656639750017, "kl": 0.029449462890625, "learning_rate": 4.980599156669676e-07, "loss": 0.0491, "num_tokens": 1532595322.0, "reward": 2.5691964626312256, "reward_std": 0.36906158924102783, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12425874918699265, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 967.1808471679688, "completions/mean_terminated_length": 735.7859497070312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5836662937509989, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12935954475736547, "kl": 0.02703857421875, "learning_rate": 4.977274015080764e-07, "loss": 0.0804, "num_tokens": 1533099003.0, "reward": 2.4966518878936768, "reward_std": 0.3702591061592102, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399912416934967, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 992.1897583007812, "completions/mean_terminated_length": 809.7722778320312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5838793884183048, "frac_reward_zero_std": 0.0, "grad_norm": 0.13912510550054555, "kl": 0.02740478515625, "learning_rate": 4.973949162780673e-07, "loss": 0.0787, "num_tokens": 1533608608.0, "reward": 2.4291296005249023, "reward_std": 0.48057666420936584, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14680634438991547, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 933.3348388671875, "completions/mean_terminated_length": 764.2724609375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5840924830856108, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1335605794019026, "kl": 0.03070068359375, "learning_rate": 4.970624601609455e-07, "loss": 0.0812, "num_tokens": 1534096246.0, "reward": 2.48828125, "reward_std": 0.3715760409832001, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1005.83935546875, "completions/mean_terminated_length": 819.347412109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5843055777529167, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13583239733942506, "kl": 0.026092529296875, "learning_rate": 4.967300333407e-07, "loss": 0.0579, "num_tokens": 1534622350.0, "reward": 2.3526787757873535, "reward_std": 0.4281512200832367, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1605832874774933, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1017.2277221679688, "completions/mean_terminated_length": 803.2937622070312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5845186724202227, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14571441991549142, "kl": 0.028350830078125, "learning_rate": 4.963976360013036e-07, "loss": 0.0777, "num_tokens": 1535148868.0, "reward": 2.4107143878936768, "reward_std": 0.4748704433441162, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.17342577874660492, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 934.1116333007812, "completions/mean_terminated_length": 731.3192749023438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5847317670875286, "frac_reward_zero_std": 0.0, "grad_norm": 0.14844399066971034, "kl": 0.0340576171875, "learning_rate": 4.960652683267125e-07, "loss": 0.0715, "num_tokens": 1535634822.0, "reward": 2.46875, "reward_std": 0.46500444412231445, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12672585248947144, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 944.15185546875, "completions/mean_terminated_length": 786.4591674804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5849448617548346, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12904060383443267, "kl": 0.029052734375, "learning_rate": 4.957329305008674e-07, "loss": 0.0522, "num_tokens": 1536129226.0, "reward": 2.3950893878936768, "reward_std": 0.3883579969406128, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10779669135808945, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1009.5469360351562, "completions/mean_terminated_length": 797.3897705078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5851579564221405, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.5190149681180043, "kl": 0.030364990234375, "learning_rate": 4.954006227076914e-07, "loss": 0.0594, "num_tokens": 1536656991.0, "reward": 2.443080425262451, "reward_std": 0.4300369322299957, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.14111433923244476, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 958.0960083007812, "completions/mean_terminated_length": 763.060546875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.5853710510894465, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12931311569296555, "kl": 0.032623291015625, "learning_rate": 4.950683451310913e-07, "loss": 0.0501, "num_tokens": 1537160218.0, "reward": 2.4715402126312256, "reward_std": 0.3861773908138275, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12035840004682541, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 979.27685546875, "completions/mean_terminated_length": 767.8181762695312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5855841457567524, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14186724362239053, "kl": 0.026947021484375, "learning_rate": 4.947360979549576e-07, "loss": 0.1013, "num_tokens": 1537672278.0, "reward": 2.4676339626312256, "reward_std": 0.39202460646629333, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1154.2679443359375, "completions/mean_terminated_length": 900.7449951171875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5857972404240583, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11708431994437764, "kl": 0.02374267578125, "learning_rate": 4.944038813631636e-07, "loss": 0.0659, "num_tokens": 1538259838.0, "reward": 2.3549108505249023, "reward_std": 0.44388580322265625, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 926.27685546875, "completions/mean_terminated_length": 791.6699829101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5860103350913644, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13073406091318251, "kl": 0.0321044921875, "learning_rate": 4.940716955395657e-07, "loss": 0.0605, "num_tokens": 1538740586.0, "reward": 2.4737725257873535, "reward_std": 0.3734135031700134, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12109260261058807, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1043.484375, "completions/mean_terminated_length": 847.9386596679688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5862234297586703, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1340401628349535, "kl": 0.025421142578125, "learning_rate": 4.937395406680035e-07, "loss": 0.1058, "num_tokens": 1539274675.0, "reward": 2.377232313156128, "reward_std": 0.4413833022117615, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14285962283611298, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1029.3817138671875, "completions/mean_terminated_length": 758.901123046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5864365244259763, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1188922580423902, "kl": 0.02655029296875, "learning_rate": 4.934074169322992e-07, "loss": 0.0571, "num_tokens": 1539816958.0, "reward": 2.4369421005249023, "reward_std": 0.36892932653427124, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.11001905798912048, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 890.794677734375, "completions/mean_terminated_length": 701.4337768554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5866496190932822, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14437006202999375, "kl": 0.029541015625, "learning_rate": 4.930753245162577e-07, "loss": 0.0791, "num_tokens": 1540282626.0, "reward": 2.5502233505249023, "reward_std": 0.3586868941783905, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10928615182638168, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1022.26123046875, "completions/mean_terminated_length": 802.6585693359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5868627137605882, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1293845638069153, "kl": 0.026702880859375, "learning_rate": 4.927432636036669e-07, "loss": 0.0534, "num_tokens": 1540810983.0, "reward": 2.3895089626312256, "reward_std": 0.4244554340839386, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1375301331281662, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 951.6964721679688, "completions/mean_terminated_length": 755.5158081054688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5870758084278941, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.23829936394360238, "kl": 0.02923583984375, "learning_rate": 4.924112343782971e-07, "loss": 0.078, "num_tokens": 1541307999.0, "reward": 2.4637277126312256, "reward_std": 0.3630683124065399, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10874075442552567, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1043.5982666015625, "completions/mean_terminated_length": 791.094970703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5872889030952, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12314977184411488, "kl": 0.024566650390625, "learning_rate": 4.920792370239009e-07, "loss": 0.062, "num_tokens": 1541848155.0, "reward": 2.463169813156128, "reward_std": 0.3783283531665802, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1028.04248046875, "completions/mean_terminated_length": 813.0243530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.587501997762506, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1397384329271977, "kl": 0.0291748046875, "learning_rate": 4.917472717242137e-07, "loss": 0.0849, "num_tokens": 1542375214.0, "reward": 2.3560268878936768, "reward_std": 0.49758273363113403, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1662929803133011, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 927.27685546875, "completions/mean_terminated_length": 757.295654296875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5877150924298119, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 6.431515629211438, "kl": 0.23150634765625, "learning_rate": 4.914153386629528e-07, "loss": 0.0893, "num_tokens": 1542861162.0, "reward": 2.4933037757873535, "reward_std": 0.46236032247543335, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.49251341819763184, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11611520498991013, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1002.88623046875, "completions/mean_terminated_length": 782.5648803710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5879281870971179, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13969255214275655, "kl": 0.028961181640625, "learning_rate": 4.910834380238175e-07, "loss": 0.0727, "num_tokens": 1543375863.0, "reward": 2.38671875, "reward_std": 0.46862998604774475, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14680634438991547, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 993.0982666015625, "completions/mean_terminated_length": 791.095703125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5881412817644238, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12951751927128352, "kl": 0.028961181640625, "learning_rate": 4.907515699904897e-07, "loss": 0.0467, "num_tokens": 1543894035.0, "reward": 2.3995537757873535, "reward_std": 0.4175123870372772, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09730304032564163, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 907.0089721679688, "completions/mean_terminated_length": 766.88720703125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5883543764317298, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1300193945552892, "kl": 0.030517578125, "learning_rate": 4.904197347466327e-07, "loss": 0.0908, "num_tokens": 1544365351.0, "reward": 2.5200893878936768, "reward_std": 0.4406583309173584, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12449963390827179, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1052.9398193359375, "completions/mean_terminated_length": 799.2969360351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5885674710990357, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12146183009950676, "kl": 0.02655029296875, "learning_rate": 4.900879324758922e-07, "loss": 0.1105, "num_tokens": 1544911004.0, "reward": 2.4581475257873535, "reward_std": 0.468540757894516, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1054.0848388671875, "completions/mean_terminated_length": 828.0712280273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5887805657663417, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12304890862021266, "kl": 0.027862548828125, "learning_rate": 4.897561633618951e-07, "loss": 0.0652, "num_tokens": 1545454002.0, "reward": 2.3046875, "reward_std": 0.47808051109313965, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18767967820167542, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1101.9085693359375, "completions/mean_terminated_length": 860.7479248046875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5889936604336476, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11027755035543774, "kl": 0.0238037109375, "learning_rate": 4.894244275882502e-07, "loss": 0.0722, "num_tokens": 1546016585.0, "reward": 2.424107313156128, "reward_std": 0.3524986505508423, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.08746545761823654, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 990.5313110351562, "completions/mean_terminated_length": 814.2864990234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5892067551009535, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12685517689018902, "kl": 0.028961181640625, "learning_rate": 4.890927253385481e-07, "loss": 0.0365, "num_tokens": 1546531415.0, "reward": 2.407924175262451, "reward_std": 0.43417537212371826, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1029.19873046875, "completions/mean_terminated_length": 765.9129028320312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5894198497682596, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1348522930847106, "kl": 0.02716064453125, "learning_rate": 4.887610567963605e-07, "loss": 0.0855, "num_tokens": 1547063840.0, "reward": 2.3911831378936768, "reward_std": 0.3990918695926666, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1121.5826416015625, "completions/mean_terminated_length": 801.648681640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5896329444355655, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1079506573575776, "kl": 0.023712158203125, "learning_rate": 4.884294221452405e-07, "loss": 0.0397, "num_tokens": 1547634789.0, "reward": 2.2963171005249023, "reward_std": 0.44277262687683105, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.16280589997768402, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1147.779052734375, "completions/mean_terminated_length": 882.3959350585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5898460391028715, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11446543844527704, "kl": 0.02337646484375, "learning_rate": 4.880978215687223e-07, "loss": 0.0517, "num_tokens": 1548212754.0, "reward": 2.2901787757873535, "reward_std": 0.41123324632644653, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1007.49560546875, "completions/mean_terminated_length": 794.9193725585938, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5900591337701774, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12570762342038644, "kl": 0.02630615234375, "learning_rate": 4.877662552503218e-07, "loss": 0.0943, "num_tokens": 1548724784.0, "reward": 2.4029018878936768, "reward_std": 0.4209842085838318, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12648425996303558, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1071.4754638671875, "completions/mean_terminated_length": 819.1151733398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5902722284374834, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12278755986742323, "kl": 0.028076171875, "learning_rate": 4.874347233735358e-07, "loss": 0.0826, "num_tokens": 1549277637.0, "reward": 2.3738839626312256, "reward_std": 0.5288773775100708, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9475446343421936, "rewards/tag_count_reward/std": 0.17865578830242157, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 966.6250610351562, "completions/mean_terminated_length": 792.9326171875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.5904853231047893, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13897269767154788, "kl": 0.0286865234375, "learning_rate": 4.871032261218409e-07, "loss": 0.0259, "num_tokens": 1549782973.0, "reward": 2.4447546005249023, "reward_std": 0.4732365608215332, "rewards/accuracy_reward/mean": 0.5879629850387573, "rewards/accuracy_reward/std": 0.4927723705768585, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14680634438991547, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1020.08935546875, "completions/mean_terminated_length": 796.6304321289062, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5906984177720953, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13729784362572958, "kl": 0.0264892578125, "learning_rate": 4.867717636786964e-07, "loss": 0.1184, "num_tokens": 1550312885.0, "reward": 2.3666296005249023, "reward_std": 0.4789509177207947, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15291258692741394, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1119.8148193359375, "completions/mean_terminated_length": 863.3076782226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5909115124394012, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11564290828594635, "kl": 0.02325439453125, "learning_rate": 4.864403362275407e-07, "loss": 0.0478, "num_tokens": 1550886450.0, "reward": 2.4090402126312256, "reward_std": 0.4232191741466522, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13064393401145935, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 908.12060546875, "completions/mean_terminated_length": 745.2805786132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5911246071067071, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1324840132015721, "kl": 0.031097412109375, "learning_rate": 4.861089439517939e-07, "loss": 0.0504, "num_tokens": 1551360680.0, "reward": 2.505580425262451, "reward_std": 0.44155511260032654, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1006.15185546875, "completions/mean_terminated_length": 796.6649169921875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5913377017740131, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13505613612280754, "kl": 0.029571533203125, "learning_rate": 4.857775870348562e-07, "loss": 0.0751, "num_tokens": 1551874876.0, "reward": 2.3761162757873535, "reward_std": 0.46554049849510193, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16289502382278442, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 943.357177734375, "completions/mean_terminated_length": 724.7914428710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.591550796441319, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13175352288456296, "kl": 0.028045654296875, "learning_rate": 4.854462656601083e-07, "loss": 0.0615, "num_tokens": 1552366412.0, "reward": 2.4464287757873535, "reward_std": 0.4057818353176117, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12935835123062134, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 985.8795166015625, "completions/mean_terminated_length": 802.3717651367188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.591763891108625, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11926111403859492, "kl": 0.0260009765625, "learning_rate": 4.851149800109113e-07, "loss": 0.0725, "num_tokens": 1552871334.0, "reward": 2.5541296005249023, "reward_std": 0.37367162108421326, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561822891235352, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 984.5513916015625, "completions/mean_terminated_length": 774.1363525390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5919769857759309, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1174080925434695, "kl": 0.02813720703125, "learning_rate": 4.84783730270606e-07, "loss": 0.0592, "num_tokens": 1553382413.0, "reward": 2.4799108505249023, "reward_std": 0.40824735164642334, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.1090860590338707, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 926.6295166015625, "completions/mean_terminated_length": 753.2216186523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.592190080443237, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.145289494256476, "kl": 0.032989501953125, "learning_rate": 4.844525166225145e-07, "loss": 0.0655, "num_tokens": 1553862599.0, "reward": 2.4877233505249023, "reward_std": 0.470186710357666, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14495466649532318, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1056.7098388671875, "completions/mean_terminated_length": 789.9320068359375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.5924031751105429, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11878164995385453, "kl": 0.024688720703125, "learning_rate": 4.841213392499375e-07, "loss": 0.0758, "num_tokens": 1554403861.0, "reward": 2.3989956378936768, "reward_std": 0.37878915667533875, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10703980922698975, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1004.0982666015625, "completions/mean_terminated_length": 804.2020874023438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.5926162697778488, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1243415865349804, "kl": 0.02520751953125, "learning_rate": 4.837901983361569e-07, "loss": 0.0816, "num_tokens": 1554922945.0, "reward": 2.5379464626312256, "reward_std": 0.36585915088653564, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10947445780038834, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1101.8504638671875, "completions/mean_terminated_length": 826.4581909179688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5928293644451548, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11381367930198392, "kl": 0.024505615234375, "learning_rate": 4.834590940644335e-07, "loss": 0.0572, "num_tokens": 1555484814.0, "reward": 2.421875, "reward_std": 0.41148841381073, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1282729059457779, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 986.060302734375, "completions/mean_terminated_length": 809.0703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.5930424591124607, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13663151511483282, "kl": 0.0286865234375, "learning_rate": 4.831280266180083e-07, "loss": 0.127, "num_tokens": 1555996793.0, "reward": 2.4994421005249023, "reward_std": 0.48856019973754883, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.17068946361541748, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 862.0156860351562, "completions/mean_terminated_length": 689.1227416992188, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5932555537797667, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13802137471156883, "kl": 0.032012939453125, "learning_rate": 4.827969961801017e-07, "loss": 0.0757, "num_tokens": 1556443440.0, "reward": 2.4737725257873535, "reward_std": 0.3226315677165985, "rewards/accuracy_reward/mean": 0.5601851940155029, "rewards/accuracy_reward/std": 0.496940016746521, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9871651530265808, "rewards/tag_count_reward/std": 0.07646700739860535, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 785.794677734375, "completions/mean_terminated_length": 655.2216796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5934686484470726, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14125665294641107, "kl": 0.032501220703125, "learning_rate": 4.824660029339137e-07, "loss": 0.083, "num_tokens": 1556856212.0, "reward": 2.5558037757873535, "reward_std": 0.40110641717910767, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1026.247802734375, "completions/mean_terminated_length": 780.00830078125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5936817431143786, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 57.62522610721485, "kl": 1.451904296875, "learning_rate": 4.821350470626239e-07, "loss": 0.1161, "num_tokens": 1557390899.0, "reward": 2.3292412757873535, "reward_std": 0.372689813375473, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 938.4888916015625, "completions/mean_terminated_length": 750.1906127929688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5938948377816845, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15258804010134852, "kl": 0.029510498046875, "learning_rate": 4.818041287493909e-07, "loss": 0.0896, "num_tokens": 1557883374.0, "reward": 2.4542412757873535, "reward_std": 0.4683377146720886, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15481625497341156, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1084.0960693359375, "completions/mean_terminated_length": 803.5360107421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5941079324489905, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1069040004224171, "kl": 0.02496337890625, "learning_rate": 4.814732481773527e-07, "loss": 0.0661, "num_tokens": 1558439689.0, "reward": 2.50390625, "reward_std": 0.37032830715179443, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09830980002880096, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1000.8460083007812, "completions/mean_terminated_length": 769.730224609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5943210271162964, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14165697878707825, "kl": 0.0302734375, "learning_rate": 4.811424055296263e-07, "loss": 0.1026, "num_tokens": 1558963748.0, "reward": 2.439732313156128, "reward_std": 0.4267694056034088, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15456202626228333, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 957.7678833007812, "completions/mean_terminated_length": 795.6307983398438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5945341217836023, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13670294685240122, "kl": 0.0308837890625, "learning_rate": 4.808116009893079e-07, "loss": 0.0163, "num_tokens": 1559455212.0, "reward": 2.443080425262451, "reward_std": 0.3932071924209595, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 880.9464721679688, "completions/mean_terminated_length": 757.0370483398438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5947472164509083, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13150223444352246, "kl": 0.032440185546875, "learning_rate": 4.804808347394724e-07, "loss": 0.0812, "num_tokens": 1559913428.0, "reward": 2.5574777126312256, "reward_std": 0.40665706992149353, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962118953466415, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1015.6295166015625, "completions/mean_terminated_length": 846.6961059570312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5949603111182142, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11295231772265461, "kl": 0.028656005859375, "learning_rate": 4.801501069631736e-07, "loss": 0.1033, "num_tokens": 1560431358.0, "reward": 2.4877233505249023, "reward_std": 0.443889319896698, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1401556432247162, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1027.9754638671875, "completions/mean_terminated_length": 812.9432373046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5951734057855202, "frac_reward_zero_std": 0.0, "grad_norm": 0.13960724355842696, "kl": 0.029510498046875, "learning_rate": 4.798194178434441e-07, "loss": 0.0552, "num_tokens": 1560966307.0, "reward": 2.33203125, "reward_std": 0.49274882674217224, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16544531285762787, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 965.77685546875, "completions/mean_terminated_length": 798.4226684570312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5953865004528262, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1330063353301934, "kl": 0.02777099609375, "learning_rate": 4.794887675632951e-07, "loss": 0.0752, "num_tokens": 1561467279.0, "reward": 2.4642858505249023, "reward_std": 0.42790815234184265, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11805575340986252, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 952.5379638671875, "completions/mean_terminated_length": 725.1778564453125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.5955995951201322, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13592517757588107, "kl": 0.027984619140625, "learning_rate": 4.791581563057156e-07, "loss": 0.0983, "num_tokens": 1561964080.0, "reward": 2.4754464626312256, "reward_std": 0.4108010232448578, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 990.8504638671875, "completions/mean_terminated_length": 785.0586547851562, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.5958126897874381, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12712627560726303, "kl": 0.027587890625, "learning_rate": 4.788275842536746e-07, "loss": 0.0847, "num_tokens": 1562476605.0, "reward": 2.4581475257873535, "reward_std": 0.4451054632663727, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15730707347393036, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1090.0804443359375, "completions/mean_terminated_length": 845.90478515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.596025784454744, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13149659682070125, "kl": 0.02325439453125, "learning_rate": 4.784970515901176e-07, "loss": 0.0802, "num_tokens": 1563030945.0, "reward": 2.3878350257873535, "reward_std": 0.44034963846206665, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14177079498767853, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1184.6898193359375, "completions/mean_terminated_length": 879.53173828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.59623887912205, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11223131520575452, "kl": 0.02471923828125, "learning_rate": 4.781665584979698e-07, "loss": 0.0594, "num_tokens": 1563638134.0, "reward": 2.2901787757873535, "reward_std": 0.4740681052207947, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.18907469511032104, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1040.02685546875, "completions/mean_terminated_length": 824.2276611328125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.5964519737893559, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13729277910788348, "kl": 0.027252197265625, "learning_rate": 4.778361051601333e-07, "loss": 0.1309, "num_tokens": 1564179490.0, "reward": 2.4207589626312256, "reward_std": 0.5194218754768372, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17913034558296204, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1068.0223388671875, "completions/mean_terminated_length": 811.2957763671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5966650684566619, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10694725886532257, "kl": 0.025848388671875, "learning_rate": 4.775056917594889e-07, "loss": 0.018, "num_tokens": 1564734268.0, "reward": 2.3152902126312256, "reward_std": 0.3530708849430084, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1240563914179802, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 969.6785888671875, "completions/mean_terminated_length": 742.3567504882812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5968781631239678, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1353107002243978, "kl": 0.031005859375, "learning_rate": 4.771753184788952e-07, "loss": 0.0933, "num_tokens": 1565241580.0, "reward": 2.43359375, "reward_std": 0.40118223428726196, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 975.1785888671875, "completions/mean_terminated_length": 769.74462890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5970912577912738, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14500113807686824, "kl": 0.0289306640625, "learning_rate": 4.768449855011884e-07, "loss": 0.061, "num_tokens": 1565752492.0, "reward": 2.5184152126312256, "reward_std": 0.4240882098674774, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14249980449676514, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 990.0938110351562, "completions/mean_terminated_length": 829.6400756835938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5973043524585797, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14096468938547524, "kl": 0.02813720703125, "learning_rate": 4.765146930091826e-07, "loss": 0.0968, "num_tokens": 1566261910.0, "reward": 2.4380581378936768, "reward_std": 0.4481762647628784, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1022.08935546875, "completions/mean_terminated_length": 805.8162231445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5975174471258857, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13166898100986094, "kl": 0.02740478515625, "learning_rate": 4.7618444118566934e-07, "loss": 0.073, "num_tokens": 1566783246.0, "reward": 2.396763563156128, "reward_std": 0.45847225189208984, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1475696861743927, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 1023.6563110351562, "completions/mean_terminated_length": 811.0565795898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5977305417931916, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 7.285055609327246, "kl": 0.039093017578125, "learning_rate": 4.7585423021341795e-07, "loss": 0.0503, "num_tokens": 1567308244.0, "reward": 2.388951063156128, "reward_std": 0.4243941605091095, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14443159103393555, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1077.51123046875, "completions/mean_terminated_length": 863.3160400390625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5979436364604975, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16797211907182072, "kl": 0.025299072265625, "learning_rate": 4.7552406027517477e-07, "loss": 0.075, "num_tokens": 1567867113.0, "reward": 2.329799175262451, "reward_std": 0.4269038736820221, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15912973880767822, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1091.1317138671875, "completions/mean_terminated_length": 873.54248046875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5981567311278035, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11755510535573062, "kl": 0.024322509765625, "learning_rate": 4.751939315536634e-07, "loss": 0.0657, "num_tokens": 1568432404.0, "reward": 2.3872768878936768, "reward_std": 0.44564881920814514, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709372282028198, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 923.93310546875, "completions/mean_terminated_length": 753.4447021484375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5983698257951094, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13295521685144893, "kl": 0.0301513671875, "learning_rate": 4.748638442315851e-07, "loss": 0.0688, "num_tokens": 1568923174.0, "reward": 2.454799175262451, "reward_std": 0.4380059242248535, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1475696861743927, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1117.7054443359375, "completions/mean_terminated_length": 870.677978515625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5985829204624155, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11375088305490733, "kl": 0.02398681640625, "learning_rate": 4.745337984916178e-07, "loss": 0.0705, "num_tokens": 1569495170.0, "reward": 2.279576063156128, "reward_std": 0.4200153946876526, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.8727678656578064, "rewards/format_reward/std": 0.3336053788661957, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 830.2835083007812, "completions/mean_terminated_length": 710.8995361328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5987960151297214, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12982746662993483, "kl": 0.0341796875, "learning_rate": 4.7420379451641656e-07, "loss": 0.0565, "num_tokens": 1569932097.0, "reward": 2.5825893878936768, "reward_std": 0.49067869782447815, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137671947479, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1353386640548706, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 951.3482666015625, "completions/mean_terminated_length": 778.490966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5990091097970274, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15890112557891883, "kl": 0.02825927734375, "learning_rate": 4.7387383248861347e-07, "loss": 0.1304, "num_tokens": 1570423549.0, "reward": 2.515625, "reward_std": 0.42147308588027954, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1003.9910888671875, "completions/mean_terminated_length": 826.8093872070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5992222044643333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13162983999777622, "kl": 0.02777099609375, "learning_rate": 4.7354391259081707e-07, "loss": 0.0904, "num_tokens": 1570948633.0, "reward": 2.458705425262451, "reward_std": 0.5123863816261292, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14262786507606506, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 943.4844360351562, "completions/mean_terminated_length": 738.9443969726562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5994352991316392, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14226413764588752, "kl": 0.028289794921875, "learning_rate": 4.732140350056129e-07, "loss": 0.1182, "num_tokens": 1571432978.0, "reward": 2.439732313156128, "reward_std": 0.3928465247154236, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1076.087158203125, "completions/mean_terminated_length": 778.5626831054688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5996483937989452, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1275971133564519, "kl": 0.0255126953125, "learning_rate": 4.728841999155626e-07, "loss": 0.0985, "num_tokens": 1571982745.0, "reward": 2.3627233505249023, "reward_std": 0.45270001888275146, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1693282276391983, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1014.97998046875, "completions/mean_terminated_length": 813.8853149414062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5998614884662511, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1284774697540026, "kl": 0.025665283203125, "learning_rate": 4.725544075032053e-07, "loss": 0.0887, "num_tokens": 1572511664.0, "reward": 2.3169643878936768, "reward_std": 0.42080557346343994, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1033.4554443359375, "completions/mean_terminated_length": 819.578369140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6000745831335571, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13404899368190185, "kl": 0.0263671875, "learning_rate": 4.7222465795105525e-07, "loss": 0.0839, "num_tokens": 1573042972.0, "reward": 2.513392925262451, "reward_std": 0.46064209938049316, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13212740421295166, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 924.2969360351562, "completions/mean_terminated_length": 740.4181518554688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.600287677800863, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14049292357184728, "kl": 0.028717041015625, "learning_rate": 4.7189495144160405e-07, "loss": 0.1036, "num_tokens": 1573524961.0, "reward": 2.462611675262451, "reward_std": 0.4035714864730835, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 997.7098388671875, "completions/mean_terminated_length": 758.876708984375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.600500772468169, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1363837907245474, "kl": 0.025146484375, "learning_rate": 4.715652881573187e-07, "loss": 0.0613, "num_tokens": 1574038591.0, "reward": 2.439174175262451, "reward_std": 0.3859565854072571, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09542291611433029, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 973.1451416015625, "completions/mean_terminated_length": 746.5540771484375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6007138671354749, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1256368810310527, "kl": 0.02777099609375, "learning_rate": 4.7123566828064265e-07, "loss": 0.0633, "num_tokens": 1574545104.0, "reward": 2.48828125, "reward_std": 0.36962586641311646, "rewards/accuracy_reward/mean": 0.5879629850387573, "rewards/accuracy_reward/std": 0.4927723705768585, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1340305656194687, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1056.946533203125, "completions/mean_terminated_length": 811.2534790039062, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6009269618027809, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13034787992930544, "kl": 0.02667236328125, "learning_rate": 4.709060919939953e-07, "loss": 0.0358, "num_tokens": 1575085288.0, "reward": 2.2393975257873535, "reward_std": 0.4536352753639221, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.151995450258255, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 968.4241333007812, "completions/mean_terminated_length": 708.249267578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6011400564700868, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.137201490972725, "kl": 0.0281982421875, "learning_rate": 4.7057655947977183e-07, "loss": 0.1299, "num_tokens": 1575584374.0, "reward": 2.44921875, "reward_std": 0.4403477609157562, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15014436841011047, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 948.37060546875, "completions/mean_terminated_length": 781.5886840820312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6013531511373927, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1308392054316944, "kl": 0.028656005859375, "learning_rate": 4.702470709203435e-07, "loss": 0.0047, "num_tokens": 1576075836.0, "reward": 2.4620537757873535, "reward_std": 0.44590461254119873, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1523328423500061, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 967.9598388671875, "completions/mean_terminated_length": 722.3616333007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6015662458046988, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13182988550173652, "kl": 0.029541015625, "learning_rate": 4.699176264980569e-07, "loss": 0.0847, "num_tokens": 1576580346.0, "reward": 2.4419643878936768, "reward_std": 0.4264742434024811, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1070.727783203125, "completions/mean_terminated_length": 889.7512817382812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6017793404720047, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1202016024418567, "kl": 0.02484130859375, "learning_rate": 4.695882263952341e-07, "loss": 0.0275, "num_tokens": 1577135360.0, "reward": 2.408482313156128, "reward_std": 0.3730555474758148, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09768717736005783, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 817.4910888671875, "completions/mean_terminated_length": 686.844482421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6019924351393107, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15864738773701906, "kl": 0.035888671875, "learning_rate": 4.6925887079417305e-07, "loss": 0.0994, "num_tokens": 1577560972.0, "reward": 2.646763563156128, "reward_std": 0.42769160866737366, "rewards/accuracy_reward/mean": 0.7321428656578064, "rewards/accuracy_reward/std": 0.4433377683162689, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14851415157318115, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1127.1585693359375, "completions/mean_terminated_length": 872.680908203125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6022055298066166, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1208160039068748, "kl": 0.02618408203125, "learning_rate": 4.6892955987714676e-07, "loss": 0.0887, "num_tokens": 1578139043.0, "reward": 2.21875, "reward_std": 0.46884554624557495, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15883232653141022, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 880.4129638671875, "completions/mean_terminated_length": 727.0934448242188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6024186244739226, "frac_reward_zero_std": 0.0, "grad_norm": 0.1362210172183119, "kl": 0.0322265625, "learning_rate": 4.686002938264038e-07, "loss": 0.0617, "num_tokens": 1578600956.0, "reward": 2.576451063156128, "reward_std": 0.4683157503604889, "rewards/accuracy_reward/mean": 0.6941964030265808, "rewards/accuracy_reward/std": 0.4612620174884796, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.12749071419239044, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 883.0000610351562, "completions/mean_terminated_length": 736.6431884765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6026317191412285, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1524470114260447, "kl": 0.031646728515625, "learning_rate": 4.682710728241673e-07, "loss": 0.0678, "num_tokens": 1579060588.0, "reward": 2.46875, "reward_std": 0.4039042592048645, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12842853367328644, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 832.5022583007812, "completions/mean_terminated_length": 703.4494018554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6028448138085345, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14810856417970533, "kl": 0.03253173828125, "learning_rate": 4.679418970526364e-07, "loss": 0.0833, "num_tokens": 1579500605.0, "reward": 2.400669813156128, "reward_std": 0.42462319135665894, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1026.5, "completions/mean_terminated_length": 787.3057861328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6030579084758404, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11555062285982094, "kl": 0.026397705078125, "learning_rate": 4.6761276669398465e-07, "loss": 0.0185, "num_tokens": 1580030573.0, "reward": 2.423549175262451, "reward_std": 0.3821007013320923, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11214316636323929, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 921.07373046875, "completions/mean_terminated_length": 729.81982421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6032710031431463, "frac_reward_zero_std": 0.25, "grad_norm": 0.13019577256502068, "kl": 0.029510498046875, "learning_rate": 4.672836819303599e-07, "loss": 0.0336, "num_tokens": 1580518030.0, "reward": 2.463169813156128, "reward_std": 0.37460607290267944, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.14405618607997894, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1045.169677734375, "completions/mean_terminated_length": 827.1630859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6034840978104523, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12102380319822655, "kl": 0.028289794921875, "learning_rate": 4.669546429438862e-07, "loss": 0.0772, "num_tokens": 1581056490.0, "reward": 2.342076063156128, "reward_std": 0.45620623230934143, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15941192209720612, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 744.4910888671875, "completions/mean_terminated_length": 569.5899047851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6036971924777582, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1344276443354919, "kl": 0.03466796875, "learning_rate": 4.6662564991666063e-07, "loss": 0.0498, "num_tokens": 1581453014.0, "reward": 2.6060268878936768, "reward_std": 0.33785903453826904, "rewards/accuracy_reward/mean": 0.6808035969734192, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389532685279846, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 942.2902221679688, "completions/mean_terminated_length": 698.2506713867188, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.6039102871450642, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14736744353647038, "kl": 0.0316162109375, "learning_rate": 4.662967030307565e-07, "loss": 0.0617, "num_tokens": 1581937944.0, "reward": 2.4614956378936768, "reward_std": 0.4201875925064087, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12015076726675034, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 922.1428833007812, "completions/mean_terminated_length": 758.0153198242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6041233818123701, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.20228246371415648, "kl": 0.02874755859375, "learning_rate": 4.6596780246822023e-07, "loss": 0.0556, "num_tokens": 1582418504.0, "reward": 2.4693081378936768, "reward_std": 0.4557398557662964, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12848563492298126, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1049.357177734375, "completions/mean_terminated_length": 838.8324584960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6043364764796761, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.7432683201551241, "kl": 0.04852294921875, "learning_rate": 4.656389484110729e-07, "loss": 0.0916, "num_tokens": 1582955544.0, "reward": 2.4302456378936768, "reward_std": 0.47149714827537537, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13254131376743317, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 995.216552734375, "completions/mean_terminated_length": 759.3469848632812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.604549571146982, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13523759984680744, "kl": 0.027313232421875, "learning_rate": 4.653101410413106e-07, "loss": 0.0373, "num_tokens": 1583469561.0, "reward": 2.3660714626312256, "reward_std": 0.42035916447639465, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16285285353660583, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 964.8125610351562, "completions/mean_terminated_length": 760.8169555664062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.604762665814288, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.5521666735325894, "kl": 0.0693359375, "learning_rate": 4.6498138054090254e-07, "loss": 0.0897, "num_tokens": 1583975621.0, "reward": 2.4330358505249023, "reward_std": 0.4382622241973877, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 964.8058471679688, "completions/mean_terminated_length": 780.973876953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.604975760481594, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13942987974062662, "kl": 0.0311279296875, "learning_rate": 4.6465266709179297e-07, "loss": 0.129, "num_tokens": 1584473774.0, "reward": 2.4575893878936768, "reward_std": 0.4858267605304718, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.15674357116222382, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 950.46435546875, "completions/mean_terminated_length": 700.8876953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6051888551488999, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13767875116422879, "kl": 0.028045654296875, "learning_rate": 4.6432400087589916e-07, "loss": 0.064, "num_tokens": 1584968302.0, "reward": 2.4174108505249023, "reward_std": 0.38686221837997437, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12537893652915955, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 975.8594360351562, "completions/mean_terminated_length": 777.3148193359375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6054019498162059, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14396323007202966, "kl": 0.028289794921875, "learning_rate": 4.6399538207511314e-07, "loss": 0.0719, "num_tokens": 1585478511.0, "reward": 2.4693081378936768, "reward_std": 0.4607437551021576, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12265980988740921, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 873.1897583007812, "completions/mean_terminated_length": 728.914794921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6056150444835118, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13316904134047614, "kl": 0.030303955078125, "learning_rate": 4.636668108713001e-07, "loss": 0.0513, "num_tokens": 1585937620.0, "reward": 2.5825893878936768, "reward_std": 0.4196329712867737, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09178353101015091, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 986.6808471679688, "completions/mean_terminated_length": 769.8521728515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6058281391508178, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13615299827012922, "kl": 0.0284423828125, "learning_rate": 4.6333828744629864e-07, "loss": 0.0606, "num_tokens": 1586442117.0, "reward": 2.4012277126312256, "reward_std": 0.42455416917800903, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14223673939704895, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 967.966552734375, "completions/mean_terminated_length": 696.44970703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6060412338181237, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13409888908545659, "kl": 0.029693603515625, "learning_rate": 4.63009811981922e-07, "loss": 0.0666, "num_tokens": 1586945126.0, "reward": 2.428013563156128, "reward_std": 0.40444400906562805, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1013.9688110351562, "completions/mean_terminated_length": 828.931640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6062543284854297, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1336980293682955, "kl": 0.0277099609375, "learning_rate": 4.6268138465995577e-07, "loss": 0.0952, "num_tokens": 1587465000.0, "reward": 2.400111675262451, "reward_std": 0.5023146867752075, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16255265474319458, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1053.9554443359375, "completions/mean_terminated_length": 804.0558471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6064674231527356, "frac_reward_zero_std": 0.0, "grad_norm": 0.13610729634938068, "kl": 0.02557373046875, "learning_rate": 4.623530056621595e-07, "loss": 0.1131, "num_tokens": 1588008228.0, "reward": 2.3705358505249023, "reward_std": 0.5395359992980957, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15850186347961426, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1048.05810546875, "completions/mean_terminated_length": 782.5367431640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6066805178200415, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12788533250369888, "kl": 0.02960205078125, "learning_rate": 4.6202467517026577e-07, "loss": 0.0909, "num_tokens": 1588540974.0, "reward": 2.4068081378936768, "reward_std": 0.4975346028804779, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1746300309896469, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 950.357177734375, "completions/mean_terminated_length": 790.3427124023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6068936124873475, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13990166966270953, "kl": 0.027801513671875, "learning_rate": 4.6169639336598044e-07, "loss": 0.0545, "num_tokens": 1589033070.0, "reward": 2.5122768878936768, "reward_std": 0.4686611592769623, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13503853976726532, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 986.1607666015625, "completions/mean_terminated_length": 779.4559936523438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6071067071546534, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1353569825824958, "kl": 0.029083251953125, "learning_rate": 4.613681604309824e-07, "loss": 0.0398, "num_tokens": 1589540198.0, "reward": 2.3515625, "reward_std": 0.35539567470550537, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765773296356, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1110.38623046875, "completions/mean_terminated_length": 816.1788940429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6073198018219594, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15425632903868391, "kl": 0.026641845703125, "learning_rate": 4.6103997654692306e-07, "loss": 0.0909, "num_tokens": 1590106051.0, "reward": 2.3387277126312256, "reward_std": 0.44057029485702515, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661912620067596, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 949.7433471679688, "completions/mean_terminated_length": 805.5277709960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6075328964892653, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12497472974267625, "kl": 0.02825927734375, "learning_rate": 4.6071184189542776e-07, "loss": 0.0298, "num_tokens": 1590596640.0, "reward": 2.4676339626312256, "reward_std": 0.3878638446331024, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.08062524348497391, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 754.4017944335938, "completions/mean_terminated_length": 651.537353515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6077459911565714, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.16273375666563764, "kl": 0.035888671875, "learning_rate": 4.6038375665809337e-07, "loss": 0.0983, "num_tokens": 1590993412.0, "reward": 2.62109375, "reward_std": 0.3242541253566742, "rewards/accuracy_reward/mean": 0.6941964030265808, "rewards/accuracy_reward/std": 0.4612620174884796, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067524552345276, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 933.0357666015625, "completions/mean_terminated_length": 753.9481811523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6079590858238773, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12126863403179937, "kl": 0.028228759765625, "learning_rate": 4.6005572101649003e-07, "loss": 0.0179, "num_tokens": 1591480628.0, "reward": 2.49609375, "reward_std": 0.35623809695243835, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10500273108482361, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 982.7500610351562, "completions/mean_terminated_length": 811.6476440429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6081721804911832, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1373511993335161, "kl": 0.030120849609375, "learning_rate": 4.5972773515216067e-07, "loss": 0.0583, "num_tokens": 1591987828.0, "reward": 2.4464287757873535, "reward_std": 0.5139682292938232, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 949.8973388671875, "completions/mean_terminated_length": 789.8158569335938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6083852751584892, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11349545906335115, "kl": 0.028594970703125, "learning_rate": 4.5939979924662e-07, "loss": 0.0426, "num_tokens": 1592484502.0, "reward": 2.47265625, "reward_std": 0.3707757294178009, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1276179403066635, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 973.2098388671875, "completions/mean_terminated_length": 825.903564453125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.6085983698257951, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.129508776677952, "kl": 0.029876708984375, "learning_rate": 4.5907191348135564e-07, "loss": 0.0723, "num_tokens": 1592990564.0, "reward": 2.4285714626312256, "reward_std": 0.4103049635887146, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10110349208116531, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 864.2142944335938, "completions/mean_terminated_length": 715.4974975585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6088114644931011, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14405055101098838, "kl": 0.032867431640625, "learning_rate": 4.5874407803782713e-07, "loss": 0.0232, "num_tokens": 1593444628.0, "reward": 2.5385046005249023, "reward_std": 0.4429364502429962, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10190140455961227, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1049.712158203125, "completions/mean_terminated_length": 874.1600952148438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.609024559160407, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13296464399506383, "kl": 0.025848388671875, "learning_rate": 4.5841629309746654e-07, "loss": 0.0612, "num_tokens": 1593982595.0, "reward": 2.4620537757873535, "reward_std": 0.48604339361190796, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14955390989780426, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1066.8192138671875, "completions/mean_terminated_length": 820.1536254882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.609237653827713, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12781966337556225, "kl": 0.025177001953125, "learning_rate": 4.5808855884167764e-07, "loss": 0.1013, "num_tokens": 1594529522.0, "reward": 2.443638563156128, "reward_std": 0.466962993144989, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 942.2098388671875, "completions/mean_terminated_length": 754.5430908203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6094507484950189, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12343636309814414, "kl": 0.029388427734375, "learning_rate": 4.5776087545183595e-07, "loss": 0.066, "num_tokens": 1595017664.0, "reward": 2.560826063156128, "reward_std": 0.36852946877479553, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.13000212609767914, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 893.5558471679688, "completions/mean_terminated_length": 704.646728515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6096638431623249, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13040082121280464, "kl": 0.030975341796875, "learning_rate": 4.5743324310928954e-07, "loss": 0.0893, "num_tokens": 1595479465.0, "reward": 2.4799108505249023, "reward_std": 0.3984030783176422, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1249600425362587, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 970.6741333007812, "completions/mean_terminated_length": 781.2230834960938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6098769378296308, "frac_reward_zero_std": 0.0, "grad_norm": 0.13817923490354794, "kl": 0.031524658203125, "learning_rate": 4.571056619953577e-07, "loss": 0.1226, "num_tokens": 1595988983.0, "reward": 2.607701063156128, "reward_std": 0.5265092849731445, "rewards/accuracy_reward/mean": 0.7455357313156128, "rewards/accuracy_reward/std": 0.4360465407371521, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16232208907604218, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1057.265625, "completions/mean_terminated_length": 825.2755126953125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6100900324969367, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1273777751826026, "kl": 0.027801513671875, "learning_rate": 4.5677813229133147e-07, "loss": 0.0841, "num_tokens": 1596527566.0, "reward": 2.4213171005249023, "reward_std": 0.46274977922439575, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14706972241401672, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 964.8192138671875, "completions/mean_terminated_length": 732.918701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6103031271642427, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.7698109684279082, "kl": 0.035400390625, "learning_rate": 4.564506541784734e-07, "loss": 0.0905, "num_tokens": 1597032285.0, "reward": 2.369419813156128, "reward_std": 0.46132078766822815, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.20024341344833374, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1034.2098388671875, "completions/mean_terminated_length": 813.8206787109375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6105162218315486, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1295568786814832, "kl": 0.026336669921875, "learning_rate": 4.561232278380177e-07, "loss": 0.0698, "num_tokens": 1597562315.0, "reward": 2.3828125, "reward_std": 0.43705669045448303, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1549774408340454, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1052.982177734375, "completions/mean_terminated_length": 839.9566650390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6107293164988546, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11780500883098978, "kl": 0.025787353515625, "learning_rate": 4.557958534511699e-07, "loss": 0.0589, "num_tokens": 1598099555.0, "reward": 2.385044813156128, "reward_std": 0.390415221452713, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.15348808467388153, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1083.1585693359375, "completions/mean_terminated_length": 823.4985961914062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6109424111661605, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10853455398672648, "kl": 0.025146484375, "learning_rate": 4.554685311991062e-07, "loss": 0.0302, "num_tokens": 1598664218.0, "reward": 2.3604912757873535, "reward_std": 0.39786389470100403, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14550481736660004, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 974.57373046875, "completions/mean_terminated_length": 755.271484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6111555058334666, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13763119087535428, "kl": 0.030059814453125, "learning_rate": 4.5514126126297504e-07, "loss": 0.0783, "num_tokens": 1599170971.0, "reward": 2.416294813156128, "reward_std": 0.4127218425273895, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14262787997722626, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 919.466552734375, "completions/mean_terminated_length": 751.6333618164062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6113686005007725, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13196013204500898, "kl": 0.031158447265625, "learning_rate": 4.5481404382389464e-07, "loss": 0.0799, "num_tokens": 1599647756.0, "reward": 2.556919813156128, "reward_std": 0.350829541683197, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13351379334926605, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 844.7611694335938, "completions/mean_terminated_length": 672.869873046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6115816951680784, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12061057348989138, "kl": 0.030731201171875, "learning_rate": 4.5448687906295535e-07, "loss": 0.0062, "num_tokens": 1600092641.0, "reward": 2.5184152126312256, "reward_std": 0.33801618218421936, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11721604317426682, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1083.790283203125, "completions/mean_terminated_length": 841.3910522460938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6117947898353844, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12397367537784326, "kl": 0.02496337890625, "learning_rate": 4.541597671612176e-07, "loss": 0.0661, "num_tokens": 1600651347.0, "reward": 2.3314733505249023, "reward_std": 0.4692508280277252, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17012260854244232, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1012.37060546875, "completions/mean_terminated_length": 769.8677978515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6120078845026903, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15030125635043512, "kl": 0.026519775390625, "learning_rate": 4.5383270829971267e-07, "loss": 0.1281, "num_tokens": 1601180393.0, "reward": 2.3911831378936768, "reward_std": 0.42565271258354187, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1040.94873046875, "completions/mean_terminated_length": 787.779296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6122209791699963, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13458235905205101, "kl": 0.026153564453125, "learning_rate": 4.5350570265944264e-07, "loss": 0.109, "num_tokens": 1601709794.0, "reward": 2.4386162757873535, "reward_std": 0.45369967818260193, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14601868391036987, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1053.5491943359375, "completions/mean_terminated_length": 810.4611206054688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6124340738373022, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12401696865521841, "kl": 0.027099609375, "learning_rate": 4.531787504213803e-07, "loss": 0.0801, "num_tokens": 1602244168.0, "reward": 2.404017925262451, "reward_std": 0.42932647466659546, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1108.763427734375, "completions/mean_terminated_length": 795.6845092773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6126471685046082, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12581406937974943, "kl": 0.023773193359375, "learning_rate": 4.5285185176646855e-07, "loss": 0.0913, "num_tokens": 1602813118.0, "reward": 2.4330358505249023, "reward_std": 0.34430232644081116, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11611521244049072, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1037.622802734375, "completions/mean_terminated_length": 797.5884399414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6128602631719141, "frac_reward_zero_std": 0.0, "grad_norm": 0.14057259627528182, "kl": 0.02569580078125, "learning_rate": 4.5252500687562087e-07, "loss": 0.0648, "num_tokens": 1603351397.0, "reward": 2.4419643878936768, "reward_std": 0.5148780345916748, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14955389499664307, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1065.450927734375, "completions/mean_terminated_length": 821.8662719726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6130733578392201, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11995240492235462, "kl": 0.02435302734375, "learning_rate": 4.5219821592972075e-07, "loss": 0.0708, "num_tokens": 1603898607.0, "reward": 2.3465402126312256, "reward_std": 0.4355149567127228, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 869.029052734375, "completions/mean_terminated_length": 704.0330810546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.613286452506526, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14101268544745005, "kl": 0.03240966796875, "learning_rate": 4.518714791096221e-07, "loss": 0.0815, "num_tokens": 1604359116.0, "reward": 2.509486675262451, "reward_std": 0.4213913381099701, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12909629940986633, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 937.4219360351562, "completions/mean_terminated_length": 735.2322387695312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6134995471738319, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1295449103127536, "kl": 0.030303955078125, "learning_rate": 4.515447965961484e-07, "loss": 0.049, "num_tokens": 1604841769.0, "reward": 2.545201063156128, "reward_std": 0.4607691168785095, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14921021461486816, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1097.6004638671875, "completions/mean_terminated_length": 841.8272094726562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6137126418411379, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1224310065558852, "kl": 0.023651123046875, "learning_rate": 4.512181685700939e-07, "loss": 0.0618, "num_tokens": 1605399158.0, "reward": 2.4095983505249023, "reward_std": 0.4590371549129486, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14409084618091583, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1104.4598388671875, "completions/mean_terminated_length": 789.9464721679688, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6139257365084438, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12425075919754522, "kl": 0.02557373046875, "learning_rate": 4.508915952122219e-07, "loss": 0.0908, "num_tokens": 1605973428.0, "reward": 2.279576063156128, "reward_std": 0.4760548174381256, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16850323975086212, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1041.6763916015625, "completions/mean_terminated_length": 799.1550903320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6141388311757499, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13368570307897557, "kl": 0.02606201171875, "learning_rate": 4.505650767032659e-07, "loss": 0.0843, "num_tokens": 1606514899.0, "reward": 2.2572546005249023, "reward_std": 0.4593934416770935, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.18073564767837524, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1029.5960693359375, "completions/mean_terminated_length": 856.7598266601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6143519258430558, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12265985993196467, "kl": 0.024810791015625, "learning_rate": 4.50238613223929e-07, "loss": 0.0847, "num_tokens": 1607045294.0, "reward": 2.416294813156128, "reward_std": 0.40520042181015015, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1613858938217163, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1062.779052734375, "completions/mean_terminated_length": 779.6695556640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6145650205103618, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12744129719216457, "kl": 0.0283203125, "learning_rate": 4.4991220495488325e-07, "loss": 0.1393, "num_tokens": 1607588331.0, "reward": 2.462611675262451, "reward_std": 0.4095012843608856, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 915.7232666015625, "completions/mean_terminated_length": 654.4285888671875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6147781151776677, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14409158958824078, "kl": 0.029388427734375, "learning_rate": 4.4958585207677134e-07, "loss": 0.0496, "num_tokens": 1608071743.0, "reward": 2.420201063156128, "reward_std": 0.4292536675930023, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14003422856330872, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 904.0000610351562, "completions/mean_terminated_length": 716.7999877929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6149912098449736, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.140985485151728, "kl": 0.027801513671875, "learning_rate": 4.4925955477020395e-07, "loss": 0.0663, "num_tokens": 1608548703.0, "reward": 2.5072546005249023, "reward_std": 0.38039425015449524, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.10128699988126755, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 918.6250610351562, "completions/mean_terminated_length": 747.3316040039062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6152043045122796, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14104766200789357, "kl": 0.030120849609375, "learning_rate": 4.489333132157622e-07, "loss": 0.0487, "num_tokens": 1609034935.0, "reward": 2.4486608505249023, "reward_std": 0.4112870395183563, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11321921646595001, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1141.888427734375, "completions/mean_terminated_length": 832.6168212890625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6154173991795855, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11475268355449765, "kl": 0.0240478515625, "learning_rate": 4.4860712759399556e-07, "loss": 0.0605, "num_tokens": 1609617525.0, "reward": 2.3236608505249023, "reward_std": 0.4319365620613098, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12337145209312439, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1013.37060546875, "completions/mean_terminated_length": 723.6742553710938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6156304938468915, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12897899610763983, "kl": 0.02655029296875, "learning_rate": 4.4828099808542265e-07, "loss": 0.069, "num_tokens": 1610139707.0, "reward": 2.365513563156128, "reward_std": 0.3626191318035126, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 898.0000610351562, "completions/mean_terminated_length": 663.0537719726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6158435885141974, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12901845046172786, "kl": 0.029083251953125, "learning_rate": 4.4795492487053155e-07, "loss": 0.0567, "num_tokens": 1610610299.0, "reward": 2.5306921005249023, "reward_std": 0.39017871022224426, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1005.7076416015625, "completions/mean_terminated_length": 765.1785888671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6160566831815034, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.1103107014736137, "kl": 0.026519775390625, "learning_rate": 4.476289081297786e-07, "loss": 0.0544, "num_tokens": 1611135880.0, "reward": 2.416294813156128, "reward_std": 0.3600604236125946, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9877232313156128, "rewards/tag_count_reward/std": 0.08601906895637512, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 897.1763916015625, "completions/mean_terminated_length": 759.0774536132812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6162697778488093, "frac_reward_zero_std": 0.25, "grad_norm": 0.12407218411372699, "kl": 0.0294189453125, "learning_rate": 4.4730294804358915e-07, "loss": 0.0739, "num_tokens": 1611602839.0, "reward": 2.5831475257873535, "reward_std": 0.3061332106590271, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10593781620264053, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1172.0023193359375, "completions/mean_terminated_length": 873.009033203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6164828725161153, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.16323903551672525, "kl": 0.02557373046875, "learning_rate": 4.469770447923572e-07, "loss": 0.0817, "num_tokens": 1612206968.0, "reward": 2.2734375, "reward_std": 0.37254172563552856, "rewards/accuracy_reward/mean": 0.37731480598449707, "rewards/accuracy_reward/std": 0.4852766990661621, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14553913474082947, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 989.2232666015625, "completions/mean_terminated_length": 796.4644165039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6166959671834212, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.10645483356306196, "kl": 0.027008056640625, "learning_rate": 4.4665119855644527e-07, "loss": 0.0709, "num_tokens": 1612717356.0, "reward": 2.463169813156128, "reward_std": 0.3140815496444702, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174960792064667, "rewards/tag_count_reward/mean": 0.9877232313156128, "rewards/tag_count_reward/std": 0.08921077847480774, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1075.3638916015625, "completions/mean_terminated_length": 847.611572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6169090618507271, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13148551061062969, "kl": 0.025665283203125, "learning_rate": 4.4632540951618423e-07, "loss": 0.09, "num_tokens": 1613270191.0, "reward": 2.435826063156128, "reward_std": 0.46753934025764465, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13802282512187958, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1079.325927734375, "completions/mean_terminated_length": 825.560546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6171221565180331, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.126206586864275, "kl": 0.023529052734375, "learning_rate": 4.459996778518733e-07, "loss": 0.104, "num_tokens": 1613817217.0, "reward": 2.40234375, "reward_std": 0.49909600615501404, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15693362057209015, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 994.9107666015625, "completions/mean_terminated_length": 762.4849853515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.617335251185339, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1357382529194367, "kl": 0.02984619140625, "learning_rate": 4.456740037437803e-07, "loss": 0.0963, "num_tokens": 1614321753.0, "reward": 2.3705358505249023, "reward_std": 0.47359156608581543, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14714714884757996, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1042.009033203125, "completions/mean_terminated_length": 806.4462890625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6175483458526451, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.391172870211858, "kl": 0.025909423828125, "learning_rate": 4.453483873721405e-07, "loss": 0.0723, "num_tokens": 1614864365.0, "reward": 2.3482143878936768, "reward_std": 0.44395479559898376, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16540852189064026, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1037.640625, "completions/mean_terminated_length": 769.3530883789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.617761440519951, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11583628232390675, "kl": 0.024871826171875, "learning_rate": 4.4502282891715816e-07, "loss": 0.0532, "num_tokens": 1615402316.0, "reward": 2.3158483505249023, "reward_std": 0.32073545455932617, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11055814474821091, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 979.7745971679688, "completions/mean_terminated_length": 781.9550170898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.617974535187257, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1448420653891271, "kl": 0.02825927734375, "learning_rate": 4.4469732855900463e-07, "loss": 0.0811, "num_tokens": 1615923559.0, "reward": 2.3856027126312256, "reward_std": 0.397357314825058, "rewards/accuracy_reward/mean": 0.47453704476356506, "rewards/accuracy_reward/std": 0.49993017315864563, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.08974618464708328, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 927.1585083007812, "completions/mean_terminated_length": 701.7882080078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6181876298545629, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14395714968330442, "kl": 0.02587890625, "learning_rate": 4.443718864778194e-07, "loss": 0.1535, "num_tokens": 1616405566.0, "reward": 2.5518975257873535, "reward_std": 0.4102056324481964, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11440251022577286, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 806.7455444335938, "completions/mean_terminated_length": 657.7949829101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6184007245218689, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.1239173052614156, "kl": 0.032806396484375, "learning_rate": 4.4404650285371003e-07, "loss": 0.0388, "num_tokens": 1616834668.0, "reward": 2.56640625, "reward_std": 0.3254980742931366, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9905133843421936, "rewards/tag_count_reward/std": 0.07129643112421036, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1069.5379638671875, "completions/mean_terminated_length": 791.97998046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6186138191891748, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1223114495098053, "kl": 0.02630615234375, "learning_rate": 4.43721177866751e-07, "loss": 0.0892, "num_tokens": 1617380045.0, "reward": 2.400669813156128, "reward_std": 0.3762684762477875, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10537806153297424, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 879.8170166015625, "completions/mean_terminated_length": 712.9336547851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6188269138564807, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13887514062787995, "kl": 0.0308837890625, "learning_rate": 4.433959116969854e-07, "loss": 0.0744, "num_tokens": 1617837259.0, "reward": 2.505580425262451, "reward_std": 0.4158722460269928, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1029.696533203125, "completions/mean_terminated_length": 844.3060913085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6190400085237867, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1210792802205966, "kl": 0.027587890625, "learning_rate": 4.4307070452442263e-07, "loss": 0.0554, "num_tokens": 1618365251.0, "reward": 2.4559152126312256, "reward_std": 0.4400137662887573, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1046096533536911, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 943.3504638671875, "completions/mean_terminated_length": 772.5283203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6192531031910926, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12240365164399232, "kl": 0.031097412109375, "learning_rate": 4.4274555652904036e-07, "loss": 0.0531, "num_tokens": 1618850368.0, "reward": 2.45703125, "reward_std": 0.45271119475364685, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12153510004281998, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 945.1160888671875, "completions/mean_terminated_length": 784.3375854492188, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.6194661978583986, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14737879200148382, "kl": 0.0301513671875, "learning_rate": 4.424204678907828e-07, "loss": 0.1093, "num_tokens": 1619335700.0, "reward": 2.4955358505249023, "reward_std": 0.42307984828948975, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13165414333343506, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 893.3817138671875, "completions/mean_terminated_length": 754.8274536132812, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6196792925257045, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13789178621337891, "kl": 0.03472900390625, "learning_rate": 4.420954387895616e-07, "loss": 0.094, "num_tokens": 1619805007.0, "reward": 2.4760046005249023, "reward_std": 0.4313313066959381, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 980.6094360351562, "completions/mean_terminated_length": 730.66943359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6198923871930105, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12767785544852975, "kl": 0.02801513671875, "learning_rate": 4.4177046940525584e-07, "loss": 0.0738, "num_tokens": 1620309712.0, "reward": 2.3833706378936768, "reward_std": 0.402037650346756, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14469929039478302, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 958.87060546875, "completions/mean_terminated_length": 750.3137817382812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6201054818603164, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.130251396607149, "kl": 0.026580810546875, "learning_rate": 4.414455599177108e-07, "loss": 0.1337, "num_tokens": 1620806486.0, "reward": 2.490513563156128, "reward_std": 0.4547174870967865, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661912620067596, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 891.1428833007812, "completions/mean_terminated_length": 719.0974731445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6203185765276223, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13619977988975016, "kl": 0.029693603515625, "learning_rate": 4.411207105067395e-07, "loss": 0.0908, "num_tokens": 1621270758.0, "reward": 2.5401787757873535, "reward_std": 0.42019975185394287, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11286582797765732, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 957.607177734375, "completions/mean_terminated_length": 775.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6205316711949284, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12758683090572753, "kl": 0.02935791015625, "learning_rate": 4.4079592135212086e-07, "loss": 0.0415, "num_tokens": 1621763206.0, "reward": 2.447544813156128, "reward_std": 0.3945702910423279, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.1151164099574089, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1139.6451416015625, "completions/mean_terminated_length": 840.4539794921875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6207447658622343, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11907224792520814, "kl": 0.022247314453125, "learning_rate": 4.4047119263360077e-07, "loss": 0.0456, "num_tokens": 1622350391.0, "reward": 2.3560268878936768, "reward_std": 0.3523109555244446, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11828288435935974, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 928.0402221679688, "completions/mean_terminated_length": 699.231201171875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6209578605295403, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14770157836194658, "kl": 0.03192138671875, "learning_rate": 4.4014652453089185e-07, "loss": 0.0807, "num_tokens": 1622836025.0, "reward": 2.4693081378936768, "reward_std": 0.42143645882606506, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11682131141424179, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 920.4397583007812, "completions/mean_terminated_length": 778.7864379882812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6211709551968462, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14731422213787046, "kl": 0.030609130859375, "learning_rate": 4.39821917223673e-07, "loss": 0.0643, "num_tokens": 1623320030.0, "reward": 2.4419643878936768, "reward_std": 0.42775759100914, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119430512189865, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 793.6652221679688, "completions/mean_terminated_length": 663.9063720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6213840498641522, "frac_reward_zero_std": 0.25, "grad_norm": 0.13074259016898343, "kl": 0.03643798828125, "learning_rate": 4.3949737089158977e-07, "loss": 0.0513, "num_tokens": 1623734520.0, "reward": 2.5970983505249023, "reward_std": 0.3793795704841614, "rewards/accuracy_reward/mean": 0.6808035969734192, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585100531578064, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1048.977783203125, "completions/mean_terminated_length": 835.0948486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6215971445314581, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11005979204053319, "kl": 0.024169921875, "learning_rate": 4.3917288571425314e-07, "loss": 0.1006, "num_tokens": 1624274462.0, "reward": 2.400669813156128, "reward_std": 0.37523284554481506, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9709821343421936, "rewards/format_reward/std": 0.16804419457912445, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.12218689918518066, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1060.68310546875, "completions/mean_terminated_length": 791.414794921875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.6218102391987641, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1280998146725062, "kl": 0.02642822265625, "learning_rate": 4.388484618712415e-07, "loss": 0.1334, "num_tokens": 1624815872.0, "reward": 2.3800225257873535, "reward_std": 0.47886040806770325, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15730705857276917, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 968.6027221679688, "completions/mean_terminated_length": 782.1099853515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.62202333386607, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1176024413660939, "kl": 0.027008056640625, "learning_rate": 4.3852409954209836e-07, "loss": 0.1117, "num_tokens": 1625321726.0, "reward": 2.4972100257873535, "reward_std": 0.3666824400424957, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.09129084646701813, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 914.2500610351562, "completions/mean_terminated_length": 745.6410522460938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6222364285333759, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14159414822526759, "kl": 0.0291748046875, "learning_rate": 4.38199798906333e-07, "loss": 0.0798, "num_tokens": 1625797454.0, "reward": 2.5496652126312256, "reward_std": 0.4418925344944, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.48100295662879944, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15107274055480957, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 934.857177734375, "completions/mean_terminated_length": 718.1653442382812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6224495232006819, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14736922970767907, "kl": 0.026611328125, "learning_rate": 4.378755601434216e-07, "loss": 0.0952, "num_tokens": 1626284302.0, "reward": 2.454799175262451, "reward_std": 0.37240859866142273, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12245608121156693, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1090.13623046875, "completions/mean_terminated_length": 862.577392578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6226626178679878, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11715308222447135, "kl": 0.025146484375, "learning_rate": 4.375513834328052e-07, "loss": 0.0684, "num_tokens": 1626840107.0, "reward": 2.3431921005249023, "reward_std": 0.45546308159828186, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518688440323, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 938.700927734375, "completions/mean_terminated_length": 701.2086791992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6228757125352938, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1302772180264295, "kl": 0.026824951171875, "learning_rate": 4.3722726895389097e-07, "loss": 0.0633, "num_tokens": 1627325557.0, "reward": 2.4263393878936768, "reward_std": 0.3694835603237152, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13422717154026031, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 986.2813110351562, "completions/mean_terminated_length": 748.4097900390625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6230888072025997, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13089899546125877, "kl": 0.02618408203125, "learning_rate": 4.369032168860513e-07, "loss": 0.1192, "num_tokens": 1627844163.0, "reward": 2.42578125, "reward_std": 0.4136088490486145, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 918.0379638671875, "completions/mean_terminated_length": 687.1854858398438, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6233019018699058, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13922063077601252, "kl": 0.031402587890625, "learning_rate": 4.3657922740862416e-07, "loss": 0.1143, "num_tokens": 1628320484.0, "reward": 2.4888393878936768, "reward_std": 0.3887077271938324, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14672234654426575, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 924.0000610351562, "completions/mean_terminated_length": 729.801025390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6235149965372117, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11395968150981141, "kl": 0.0279541015625, "learning_rate": 4.362553007009131e-07, "loss": 0.0392, "num_tokens": 1628803156.0, "reward": 2.50390625, "reward_std": 0.33635610342025757, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.1038430780172348, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 863.1004638671875, "completions/mean_terminated_length": 693.8290405273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6237280912045176, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15620490141293092, "kl": 0.03082275390625, "learning_rate": 4.359314369421866e-07, "loss": 0.0986, "num_tokens": 1629258449.0, "reward": 2.3989956378936768, "reward_std": 0.46517857909202576, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16737332940101624, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1020.904052734375, "completions/mean_terminated_length": 814.3834228515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6239411858718236, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13903351964370164, "kl": 0.02783203125, "learning_rate": 4.3560763631167876e-07, "loss": 0.0926, "num_tokens": 1629780918.0, "reward": 2.4213171005249023, "reward_std": 0.5255187153816223, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.18306908011436462, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 947.1785888671875, "completions/mean_terminated_length": 707.8695678710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6241542805391295, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12844324080117073, "kl": 0.02734375, "learning_rate": 4.352838989885882e-07, "loss": 0.0866, "num_tokens": 1630269718.0, "reward": 2.404017925262451, "reward_std": 0.3857608139514923, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.18154285848140717, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1104.671875, "completions/mean_terminated_length": 837.0802612304688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6243673752064355, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.3006043144557853, "kl": 0.075439453125, "learning_rate": 4.349602251520786e-07, "loss": 0.0535, "num_tokens": 1630855827.0, "reward": 2.34375, "reward_std": 0.46391230821609497, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14088857173919678, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1042.634033203125, "completions/mean_terminated_length": 800.3434448242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6245804698737414, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1403942875675658, "kl": 0.02410888671875, "learning_rate": 4.34636614981279e-07, "loss": 0.0545, "num_tokens": 1631397999.0, "reward": 2.373326063156128, "reward_std": 0.39641252160072327, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14590215682983398, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 847.5848388671875, "completions/mean_terminated_length": 703.5349731445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6247935645410474, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1352385194778094, "kl": 0.031646728515625, "learning_rate": 4.343130686552826e-07, "loss": 0.0595, "num_tokens": 1631850549.0, "reward": 2.4972100257873535, "reward_std": 0.4342727065086365, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1397307962179184, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 923.5938110351562, "completions/mean_terminated_length": 742.9896240234375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6250066592083533, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.20393504962754053, "kl": 0.027984619140625, "learning_rate": 4.3398958635314764e-07, "loss": 0.0791, "num_tokens": 1632326399.0, "reward": 2.4146206378936768, "reward_std": 0.43712449073791504, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13572438061237335, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 985.935302734375, "completions/mean_terminated_length": 775.7941284179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6252197538756593, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12234409285833742, "kl": 0.028289794921875, "learning_rate": 4.3366616825389666e-07, "loss": 0.0734, "num_tokens": 1632841490.0, "reward": 2.390625, "reward_std": 0.38307416439056396, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 959.8594360351562, "completions/mean_terminated_length": 775.18798828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6254328485429652, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12935150974258922, "kl": 0.02691650390625, "learning_rate": 4.333428145365172e-07, "loss": 0.0629, "num_tokens": 1633338643.0, "reward": 2.4347100257873535, "reward_std": 0.41431596875190735, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780035495758057, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1006.685302734375, "completions/mean_terminated_length": 783.7479858398438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.6256459432102711, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1317130484853896, "kl": 0.02923583984375, "learning_rate": 4.3301952537996047e-07, "loss": 0.0644, "num_tokens": 1633856998.0, "reward": 2.3861608505249023, "reward_std": 0.4497280418872833, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15456201136112213, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1026.2388916015625, "completions/mean_terminated_length": 793.8931884765625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6258590378775771, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.134650522901962, "kl": 0.026824951171875, "learning_rate": 4.3269630096314224e-07, "loss": 0.083, "num_tokens": 1634391601.0, "reward": 2.4425225257873535, "reward_std": 0.3779553472995758, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10811902582645416, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 889.919677734375, "completions/mean_terminated_length": 727.8473510742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.626072132544883, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13141837685870725, "kl": 0.032928466796875, "learning_rate": 4.3237314146494275e-07, "loss": 0.0529, "num_tokens": 1634853597.0, "reward": 2.5630581378936768, "reward_std": 0.40721070766448975, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13017487525939941, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1092.513427734375, "completions/mean_terminated_length": 842.2027587890625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.626285227212189, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13281598774204922, "kl": 0.02484130859375, "learning_rate": 4.3205004706420565e-07, "loss": 0.0761, "num_tokens": 1635413091.0, "reward": 2.338169813156128, "reward_std": 0.4590991735458374, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17070867121219635, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 971.0313110351562, "completions/mean_terminated_length": 804.4896850585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.626498321879495, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13274640130248275, "kl": 0.029815673828125, "learning_rate": 4.3172701793973953e-07, "loss": 0.0403, "num_tokens": 1635913153.0, "reward": 2.439174175262451, "reward_std": 0.4147360920906067, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1086.9442138671875, "completions/mean_terminated_length": 828.3031005859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.626711416546801, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13791095620338129, "kl": 0.023162841796875, "learning_rate": 4.314040542703158e-07, "loss": 0.0393, "num_tokens": 1636474296.0, "reward": 2.3309152126312256, "reward_std": 0.40279242396354675, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11757759749889374, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1017.638427734375, "completions/mean_terminated_length": 776.369140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6269245112141069, "frac_reward_zero_std": 0.0, "grad_norm": 0.13334995685296192, "kl": 0.027252197265625, "learning_rate": 4.3108115623467024e-07, "loss": 0.0646, "num_tokens": 1637002470.0, "reward": 2.4347100257873535, "reward_std": 0.4832659959793091, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 895.325927734375, "completions/mean_terminated_length": 720.4987182617188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6271376058814128, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14117247860426582, "kl": 0.03204345703125, "learning_rate": 4.3075832401150237e-07, "loss": 0.0614, "num_tokens": 1637471048.0, "reward": 2.497767925262451, "reward_std": 0.45960891246795654, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.17274779081344604, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 914.26123046875, "completions/mean_terminated_length": 742.305908203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6273507005487188, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16033518372201397, "kl": 0.031585693359375, "learning_rate": 4.3043555777947483e-07, "loss": 0.0951, "num_tokens": 1637953709.0, "reward": 2.446986675262451, "reward_std": 0.4910048246383667, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764795243740082, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1005.7500610351562, "completions/mean_terminated_length": 812.74072265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6275637952160247, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.123985975829371, "kl": 0.02734375, "learning_rate": 4.3011285771721416e-07, "loss": 0.0652, "num_tokens": 1638475853.0, "reward": 2.474330425262451, "reward_std": 0.42471638321876526, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12473505735397339, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 987.5535888671875, "completions/mean_terminated_length": 770.9031982421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6277768898833307, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13642001030688675, "kl": 0.029815673828125, "learning_rate": 4.297902240033102e-07, "loss": 0.1084, "num_tokens": 1638985141.0, "reward": 2.3487725257873535, "reward_std": 0.3983851969242096, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13802284002304077, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1026.44873046875, "completions/mean_terminated_length": 776.7361450195312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6279899845506366, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15240731021591938, "kl": 0.03106689453125, "learning_rate": 4.2946765681631605e-07, "loss": 0.0767, "num_tokens": 1639512750.0, "reward": 2.3716518878936768, "reward_std": 0.5022507309913635, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1642991006374359, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 978.9442138671875, "completions/mean_terminated_length": 835.5012817382812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6282030792179426, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14541485338549148, "kl": 0.027587890625, "learning_rate": 4.29145156334748e-07, "loss": 0.0962, "num_tokens": 1640015669.0, "reward": 2.415736675262451, "reward_std": 0.42475083470344543, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1021.0870971679688, "completions/mean_terminated_length": 814.603271484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6284161738852485, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11826521985297453, "kl": 0.025604248046875, "learning_rate": 4.288227227370851e-07, "loss": 0.086, "num_tokens": 1640550332.0, "reward": 2.4090402126312256, "reward_std": 0.4001913368701935, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13700605928897858, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 958.6964721679688, "completions/mean_terminated_length": 777.1458740234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6286292685525545, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13759595384798867, "kl": 0.02801513671875, "learning_rate": 4.2850035620176994e-07, "loss": 0.0695, "num_tokens": 1641048196.0, "reward": 2.4341518878936768, "reward_std": 0.484378457069397, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16713166236877441, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1076.5826416015625, "completions/mean_terminated_length": 808.128173828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6288423632198604, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10844120930873578, "kl": 0.022369384765625, "learning_rate": 4.2817805690720744e-07, "loss": 0.0528, "num_tokens": 1641601257.0, "reward": 2.3995537757873535, "reward_std": 0.3386138677597046, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9665178656578064, "rewards/format_reward/std": 0.18009299039840698, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10110348463058472, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 797.3660888671875, "completions/mean_terminated_length": 694.656982421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6290554578871663, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12832393414898433, "kl": 0.0322265625, "learning_rate": 4.27855825031766e-07, "loss": 0.0788, "num_tokens": 1642022173.0, "reward": 2.6060268878936768, "reward_std": 0.3501691222190857, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389531940221786, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 998.5067138671875, "completions/mean_terminated_length": 800.8567504882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6292685525544723, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13658840187630403, "kl": 0.026214599609375, "learning_rate": 4.2753366075377595e-07, "loss": 0.0842, "num_tokens": 1642540400.0, "reward": 2.5066964626312256, "reward_std": 0.4364209473133087, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14753690361976624, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1129.571533203125, "completions/mean_terminated_length": 920.7233276367188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6294816472217782, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12549913842168856, "kl": 0.023651123046875, "learning_rate": 4.272115642515305e-07, "loss": 0.0533, "num_tokens": 1643121152.0, "reward": 2.3080358505249023, "reward_std": 0.46695148944854736, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.17342577874660492, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 903.0178833007812, "completions/mean_terminated_length": 775.166259765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6296947418890843, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13263076842967625, "kl": 0.027984619140625, "learning_rate": 4.26889535703286e-07, "loss": 0.0264, "num_tokens": 1643590888.0, "reward": 2.521205425262451, "reward_std": 0.3816625773906708, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09310024231672287, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1034.3125, "completions/mean_terminated_length": 807.2021484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6299078365563902, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1276400854732765, "kl": 0.025390625, "learning_rate": 4.265675752872597e-07, "loss": 0.0679, "num_tokens": 1644123220.0, "reward": 2.4112725257873535, "reward_std": 0.4529055953025818, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14635494351387024, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 988.8817138671875, "completions/mean_terminated_length": 796.0607299804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6301209312236962, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13500087789665793, "kl": 0.02618408203125, "learning_rate": 4.262456831816329e-07, "loss": 0.0767, "num_tokens": 1644642815.0, "reward": 2.3136162757873535, "reward_std": 0.4059527516365051, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10032878816127777, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 872.4285888671875, "completions/mean_terminated_length": 741.1612548828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6303340258910021, "frac_reward_zero_std": 0.0, "grad_norm": 0.14888490151885184, "kl": 0.0325927734375, "learning_rate": 4.259238595645476e-07, "loss": 0.0619, "num_tokens": 1645098143.0, "reward": 2.517299175262451, "reward_std": 0.4469611644744873, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13206008076667786, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1018.5491333007812, "completions/mean_terminated_length": 808.231201171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6305471205583081, "frac_reward_zero_std": 0.0, "grad_norm": 0.1299084741473585, "kl": 0.027984619140625, "learning_rate": 4.2560210461410906e-07, "loss": 0.0992, "num_tokens": 1645621205.0, "reward": 2.384486675262451, "reward_std": 0.5200793147087097, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.18502935767173767, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 933.7388916015625, "completions/mean_terminated_length": 713.2700805664062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.630760215225614, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.16559433291797698, "kl": 0.0306396484375, "learning_rate": 4.252804185083837e-07, "loss": 0.0862, "num_tokens": 1646121152.0, "reward": 2.415736675262451, "reward_std": 0.45371320843696594, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15371057391166687, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1016.0647583007812, "completions/mean_terminated_length": 784.8660888671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6309733098929199, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13604954067442032, "kl": 0.028472900390625, "learning_rate": 4.2495880142540007e-07, "loss": 0.1047, "num_tokens": 1646646125.0, "reward": 2.416294813156128, "reward_std": 0.5145922899246216, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17598041892051697, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1002.013427734375, "completions/mean_terminated_length": 760.6318969726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6311864045602259, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12936300938247594, "kl": 0.02728271484375, "learning_rate": 4.2463725354314893e-07, "loss": 0.0676, "num_tokens": 1647166707.0, "reward": 2.3973214626312256, "reward_std": 0.467938095331192, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16981780529022217, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 953.1451416015625, "completions/mean_terminated_length": 753.8179931640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6313994992275318, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13145552105635308, "kl": 0.0316162109375, "learning_rate": 4.2431577503958217e-07, "loss": 0.0881, "num_tokens": 1647665908.0, "reward": 2.455357313156128, "reward_std": 0.4772571623325348, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18066051602363586, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1024.34375, "completions/mean_terminated_length": 825.072021484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6316125938948378, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13959372094947137, "kl": 0.0267333984375, "learning_rate": 4.239943660926136e-07, "loss": 0.1096, "num_tokens": 1648192990.0, "reward": 2.4603796005249023, "reward_std": 0.43478232622146606, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11395422369241714, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 992.99560546875, "completions/mean_terminated_length": 705.26708984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6318256885621437, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1325381857666098, "kl": 0.025482177734375, "learning_rate": 4.2367302688011874e-07, "loss": 0.0888, "num_tokens": 1648710812.0, "reward": 2.361049175262451, "reward_std": 0.3898082673549652, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 968.1897583007812, "completions/mean_terminated_length": 729.866455078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6320387832294497, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14864640967625145, "kl": 0.029632568359375, "learning_rate": 4.233517575799338e-07, "loss": 0.1231, "num_tokens": 1649216353.0, "reward": 2.373326063156128, "reward_std": 0.4889039397239685, "rewards/accuracy_reward/mean": 0.5115740895271301, "rewards/accuracy_reward/std": 0.5004456043243408, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16288256645202637, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1023.6629638671875, "completions/mean_terminated_length": 797.5830688476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6322518778967556, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13637078549174073, "kl": 0.029632568359375, "learning_rate": 4.230305583698569e-07, "loss": 0.0914, "num_tokens": 1649740602.0, "reward": 2.4056921005249023, "reward_std": 0.45324209332466125, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16934573650360107, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1039.196533203125, "completions/mean_terminated_length": 792.6000366210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6324649725640615, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12530244172115756, "kl": 0.026702880859375, "learning_rate": 4.227094294276473e-07, "loss": 0.0763, "num_tokens": 1650284770.0, "reward": 2.4637277126312256, "reward_std": 0.41138678789138794, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14895901083946228, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 939.5156860351562, "completions/mean_terminated_length": 787.5913696289062, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.6326780672313675, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1416997760616862, "kl": 0.02728271484375, "learning_rate": 4.223883709310252e-07, "loss": 0.0573, "num_tokens": 1650774201.0, "reward": 2.3950893878936768, "reward_std": 0.3920544981956482, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09730304032564163, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1149.805908203125, "completions/mean_terminated_length": 809.8738403320312, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6328911618986734, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11939228816286149, "kl": 0.024169921875, "learning_rate": 4.22067383057672e-07, "loss": 0.0611, "num_tokens": 1651354146.0, "reward": 2.3208706378936768, "reward_std": 0.5122708678245544, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17529362440109253, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 997.497802734375, "completions/mean_terminated_length": 806.2454223632812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6331042565659795, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.9931877282475534, "kl": 0.03173828125, "learning_rate": 4.217464659852299e-07, "loss": 0.0791, "num_tokens": 1651875329.0, "reward": 2.443080425262451, "reward_std": 0.4361160695552826, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12758491933345795, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1067.91748046875, "completions/mean_terminated_length": 821.5278930664062, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6333173512332854, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11457908333525443, "kl": 0.023529052734375, "learning_rate": 4.21425619891302e-07, "loss": 0.0364, "num_tokens": 1652432812.0, "reward": 2.3683037757873535, "reward_std": 0.4116702377796173, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13629460334777832, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1016.6451416015625, "completions/mean_terminated_length": 775.1432495117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6335304459005914, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14003342049147827, "kl": 0.029815673828125, "learning_rate": 4.2110484495345165e-07, "loss": 0.0898, "num_tokens": 1652966173.0, "reward": 2.3325893878936768, "reward_std": 0.48819270730018616, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067808747291565, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.16542361676692963, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1138.493408203125, "completions/mean_terminated_length": 835.3244018554688, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6337435405678973, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12875176609317993, "kl": 0.02496337890625, "learning_rate": 4.207841413492038e-07, "loss": 0.0867, "num_tokens": 1653545210.0, "reward": 2.3822546005249023, "reward_std": 0.4378181993961334, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13494952023029327, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 973.8906860351562, "completions/mean_terminated_length": 740.3886108398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6339566352352033, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15653811729111977, "kl": 0.029510498046875, "learning_rate": 4.204635092560429e-07, "loss": 0.1461, "num_tokens": 1654047433.0, "reward": 2.42578125, "reward_std": 0.4407943785190582, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16000598669052124, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1114.2076416015625, "completions/mean_terminated_length": 817.5911865234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6341697299025092, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11354047711358363, "kl": 0.0225830078125, "learning_rate": 4.2014294885141476e-07, "loss": 0.0123, "num_tokens": 1654620742.0, "reward": 2.299107313156128, "reward_std": 0.39304473996162415, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1070.3817138671875, "completions/mean_terminated_length": 886.2678833007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6343828245698151, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12367071681383417, "kl": 0.024566650390625, "learning_rate": 4.198224603127245e-07, "loss": 0.084, "num_tokens": 1655182785.0, "reward": 2.228236675262451, "reward_std": 0.4465470612049103, "rewards/accuracy_reward/mean": 0.3325892984867096, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1063.328125, "completions/mean_terminated_length": 805.371826171875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6345959192371211, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1226016190428056, "kl": 0.025634765625, "learning_rate": 4.195020438173381e-07, "loss": 0.0527, "num_tokens": 1655723540.0, "reward": 2.3130581378936768, "reward_std": 0.40664881467819214, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14869897067546844, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 997.2678833007812, "completions/mean_terminated_length": 792.725341796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.634809013904427, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13436095208757137, "kl": 0.027313232421875, "learning_rate": 4.19181699542582e-07, "loss": 0.0892, "num_tokens": 1656245676.0, "reward": 2.4425225257873535, "reward_std": 0.44571688771247864, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15371058881282806, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 957.5000610351562, "completions/mean_terminated_length": 779.0545043945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.635022108571733, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1292508953312534, "kl": 0.027984619140625, "learning_rate": 4.188614276657416e-07, "loss": 0.0614, "num_tokens": 1656742396.0, "reward": 2.419642925262451, "reward_std": 0.4142501950263977, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10383255779743195, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 864.138427734375, "completions/mean_terminated_length": 718.7518920898438, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.6352352032390389, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13721727498447991, "kl": 0.03369140625, "learning_rate": 4.185412283640634e-07, "loss": 0.0899, "num_tokens": 1657203802.0, "reward": 2.560267925262451, "reward_std": 0.43181318044662476, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1132.3482666015625, "completions/mean_terminated_length": 852.0466918945312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6354482979063449, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11258348807334408, "kl": 0.023529052734375, "learning_rate": 4.182211018147528e-07, "loss": 0.0533, "num_tokens": 1657782230.0, "reward": 2.251674175262451, "reward_std": 0.45014962553977966, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13978439569473267, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1030.3326416015625, "completions/mean_terminated_length": 841.8756103515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6356613925736508, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10790544293355565, "kl": 0.025634765625, "learning_rate": 4.1790104819497575e-07, "loss": 0.0537, "num_tokens": 1658317547.0, "reward": 2.3543527126312256, "reward_std": 0.35733041167259216, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11938986927270889, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1005.2500610351562, "completions/mean_terminated_length": 812.1481323242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6358744872409567, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12395280872237067, "kl": 0.026214599609375, "learning_rate": 4.175810676818571e-07, "loss": 0.0883, "num_tokens": 1658834731.0, "reward": 2.412388563156128, "reward_std": 0.3492583632469177, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692163944244, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 915.2723388671875, "completions/mean_terminated_length": 729.9168701171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6360875819082628, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1347772509847454, "kl": 0.03082275390625, "learning_rate": 4.172611604524816e-07, "loss": 0.078, "num_tokens": 1659314821.0, "reward": 2.55078125, "reward_std": 0.3718591034412384, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11614610999822617, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 973.01123046875, "completions/mean_terminated_length": 773.9391479492188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6363006765755687, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14009595911792153, "kl": 0.026580810546875, "learning_rate": 4.1694132668389357e-07, "loss": 0.0983, "num_tokens": 1659824234.0, "reward": 2.4698662757873535, "reward_std": 0.42816421389579773, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.14693914353847504, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 878.6785888671875, "completions/mean_terminated_length": 728.4634399414062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6365137712428747, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12888184496619312, "kl": 0.031707763671875, "learning_rate": 4.166215665530964e-07, "loss": 0.0545, "num_tokens": 1660287930.0, "reward": 2.4308037757873535, "reward_std": 0.40837255120277405, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14955389499664307, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1041.5982666015625, "completions/mean_terminated_length": 805.9393920898438, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6367268659101806, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13356754723905398, "kl": 0.02508544921875, "learning_rate": 4.1630188023705306e-07, "loss": 0.1032, "num_tokens": 1660820950.0, "reward": 2.455357313156128, "reward_std": 0.5176732540130615, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.18192753195762634, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1006.4375610351562, "completions/mean_terminated_length": 800.3529663085938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6369399605774866, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12813688435146575, "kl": 0.02655029296875, "learning_rate": 4.159822679126852e-07, "loss": 0.081, "num_tokens": 1661335754.0, "reward": 2.4135046005249023, "reward_std": 0.46410855650901794, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1035.515625, "completions/mean_terminated_length": 801.8654174804688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6371530552447925, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12312479346631165, "kl": 0.027130126953125, "learning_rate": 4.1566272975687386e-07, "loss": 0.0925, "num_tokens": 1661874641.0, "reward": 2.37109375, "reward_std": 0.43238145112991333, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1310732364654541, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 979.9241333007812, "completions/mean_terminated_length": 775.39892578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6373661499120985, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12362543819219723, "kl": 0.02801513671875, "learning_rate": 4.153432659464591e-07, "loss": 0.0987, "num_tokens": 1662379135.0, "reward": 2.411830425262451, "reward_std": 0.37004172801971436, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15116052329540253, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 735.7098388671875, "completions/mean_terminated_length": 638.1535034179688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6375792445794044, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13963877361113958, "kl": 0.036102294921875, "learning_rate": 4.1502387665823915e-07, "loss": 0.0979, "num_tokens": 1662777293.0, "reward": 2.5652902126312256, "reward_std": 0.36679255962371826, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.0952264666557312, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 877.5178833007812, "completions/mean_terminated_length": 699.9896850585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6377923392467103, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13946368758263236, "kl": 0.030181884765625, "learning_rate": 4.147045620689723e-07, "loss": 0.0771, "num_tokens": 1663235397.0, "reward": 2.549107313156128, "reward_std": 0.45275506377220154, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1378791630268097, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1050.169677734375, "completions/mean_terminated_length": 759.73486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6380054339140163, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13047985088118638, "kl": 0.02593994140625, "learning_rate": 4.1438532235537417e-07, "loss": 0.0699, "num_tokens": 1663780209.0, "reward": 2.3911831378936768, "reward_std": 0.4369259476661682, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 969.6629638671875, "completions/mean_terminated_length": 766.5808715820312, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6382185285813222, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12311184859582003, "kl": 0.0267333984375, "learning_rate": 4.1406615769411977e-07, "loss": 0.0311, "num_tokens": 1664282010.0, "reward": 2.412388563156128, "reward_std": 0.4343810975551605, "rewards/accuracy_reward/mean": 0.5046296119689941, "rewards/accuracy_reward/std": 0.5005582571029663, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.1137678399682045, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 779.0580444335938, "completions/mean_terminated_length": 647.7881469726562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6384316232486282, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16943548910667572, "kl": 0.03753662109375, "learning_rate": 4.1374706826184225e-07, "loss": 0.0858, "num_tokens": 1664702772.0, "reward": 2.58203125, "reward_std": 0.4265348017215729, "rewards/accuracy_reward/mean": 0.6763392686843872, "rewards/accuracy_reward/std": 0.46839532256126404, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1107.15625, "completions/mean_terminated_length": 826.2667236328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6386447179159341, "frac_reward_zero_std": 0.0, "grad_norm": 0.13644054718963786, "kl": 0.024658203125, "learning_rate": 4.1342805423513317e-07, "loss": 0.108, "num_tokens": 1665273226.0, "reward": 2.2974331378936768, "reward_std": 0.46128368377685547, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764793753623962, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 898.9576416015625, "completions/mean_terminated_length": 714.3963623046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6388578125832401, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13336070491661936, "kl": 0.03216552734375, "learning_rate": 4.131091157905423e-07, "loss": 0.0359, "num_tokens": 1665741879.0, "reward": 2.51171875, "reward_std": 0.4706316888332367, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14720547199249268, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 871.2098388671875, "completions/mean_terminated_length": 696.2000122070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.639070907250546, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14377150069264016, "kl": 0.030792236328125, "learning_rate": 4.127902531045778e-07, "loss": 0.0572, "num_tokens": 1666199253.0, "reward": 2.482142925262451, "reward_std": 0.37498924136161804, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14714714884757996, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1201.6317138671875, "completions/mean_terminated_length": 909.3423461914062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.639284001917852, "frac_reward_zero_std": 0.0, "grad_norm": 0.11887372512125888, "kl": 0.021759033203125, "learning_rate": 4.1247146635370567e-07, "loss": 0.0931, "num_tokens": 1666816496.0, "reward": 2.283482313156128, "reward_std": 0.519429087638855, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.17261767387390137, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 937.1897583007812, "completions/mean_terminated_length": 752.0546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.639497096585158, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13358804861762266, "kl": 0.02587890625, "learning_rate": 4.1215275571435014e-07, "loss": 0.07, "num_tokens": 1667302181.0, "reward": 2.3549108505249023, "reward_std": 0.3632870614528656, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10013572871685028, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1167.5848388671875, "completions/mean_terminated_length": 863.5375366210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6397101912524639, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1081429511955512, "kl": 0.0240478515625, "learning_rate": 4.1183412136289287e-07, "loss": 0.0574, "num_tokens": 1667901947.0, "reward": 2.294642925262451, "reward_std": 0.47484898567199707, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.175273135304451, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 919.2678833007812, "completions/mean_terminated_length": 731.1458740234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6399232859197699, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13105404150471958, "kl": 0.0269775390625, "learning_rate": 4.115155634756738e-07, "loss": 0.0256, "num_tokens": 1668383539.0, "reward": 2.3984375, "reward_std": 0.35650792717933655, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.1075822189450264, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1048.0648193359375, "completions/mean_terminated_length": 793.1792602539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6401363805870758, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11780557426247855, "kl": 0.0262451171875, "learning_rate": 4.111970822289902e-07, "loss": 0.0367, "num_tokens": 1668930976.0, "reward": 2.3755581378936768, "reward_std": 0.41952234506607056, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.146857351064682, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 968.0670166015625, "completions/mean_terminated_length": 747.4354858398438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6403494752543818, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11940137028206221, "kl": 0.028045654296875, "learning_rate": 4.1087867779909713e-07, "loss": 0.0585, "num_tokens": 1669441358.0, "reward": 2.4107143878936768, "reward_std": 0.4208201766014099, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13526484370231628, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 942.466552734375, "completions/mean_terminated_length": 764.8937377929688, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6405625699216877, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.26227139932313126, "kl": 0.02886962890625, "learning_rate": 4.1056035036220716e-07, "loss": 0.0512, "num_tokens": 1669941551.0, "reward": 2.36328125, "reward_std": 0.4572998881340027, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 885.185302734375, "completions/mean_terminated_length": 729.1620483398438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6407756645889937, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13052642975409745, "kl": 0.029083251953125, "learning_rate": 4.102421000944899e-07, "loss": 0.0947, "num_tokens": 1670399810.0, "reward": 2.4916296005249023, "reward_std": 0.44786086678504944, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1339094191789627, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 963.716552734375, "completions/mean_terminated_length": 731.5799560546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6409887592562996, "frac_reward_zero_std": 0.0, "grad_norm": 0.13308879468570403, "kl": 0.029327392578125, "learning_rate": 4.099239271720729e-07, "loss": 0.0854, "num_tokens": 1670898771.0, "reward": 2.3822546005249023, "reward_std": 0.4310140907764435, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14249980449676514, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1009.6272583007812, "completions/mean_terminated_length": 797.486572265625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.6412018539236055, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13130962357303472, "kl": 0.02606201171875, "learning_rate": 4.0960583177104e-07, "loss": 0.0816, "num_tokens": 1671425388.0, "reward": 2.4637277126312256, "reward_std": 0.5023805499076843, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.13192766904830933, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 916.5938110351562, "completions/mean_terminated_length": 761.5278930664062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6414149485909115, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14477767232149627, "kl": 0.030242919921875, "learning_rate": 4.092878140674333e-07, "loss": 0.0998, "num_tokens": 1671911622.0, "reward": 2.400111675262451, "reward_std": 0.40901997685432434, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10593781620264053, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 822.9420166015625, "completions/mean_terminated_length": 644.3529663085938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6416280432582174, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14745979197921935, "kl": 0.033050537109375, "learning_rate": 4.089698742372506e-07, "loss": 0.0973, "num_tokens": 1672344988.0, "reward": 2.3934152126312256, "reward_std": 0.48596134781837463, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1810806840658188, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 929.8527221679688, "completions/mean_terminated_length": 786.2115478515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6418411379255234, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14197540583233784, "kl": 0.03289794921875, "learning_rate": 4.086520124564479e-07, "loss": 0.0734, "num_tokens": 1672824474.0, "reward": 2.462611675262451, "reward_std": 0.41160017251968384, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1046096533536911, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 989.6942138671875, "completions/mean_terminated_length": 734.6453857421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6420542325928293, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13739094661220477, "kl": 0.027679443359375, "learning_rate": 4.0833422890093684e-07, "loss": 0.0814, "num_tokens": 1673345745.0, "reward": 2.474330425262451, "reward_std": 0.3747579753398895, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 989.7879638671875, "completions/mean_terminated_length": 793.82275390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6422673272601354, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.12136133372120773, "kl": 0.025543212890625, "learning_rate": 4.0801652374658644e-07, "loss": 0.0366, "num_tokens": 1673864866.0, "reward": 2.34375, "reward_std": 0.2922419011592865, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.19128035008907318, "rewards/tag_count_reward/mean": 0.9888392686843872, "rewards/tag_count_reward/std": 0.07941616326570511, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1012.5938110351562, "completions/mean_terminated_length": 794.3189086914062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6424804219274413, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11656234838462415, "kl": 0.02606201171875, "learning_rate": 4.0769889716922247e-07, "loss": 0.0654, "num_tokens": 1674389180.0, "reward": 2.4056921005249023, "reward_std": 0.3991907238960266, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962119698524475, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1086.24560546875, "completions/mean_terminated_length": 861.0413208007812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6426935165947473, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15186825284309066, "kl": 0.028350830078125, "learning_rate": 4.0738134934462643e-07, "loss": 0.073, "num_tokens": 1674951178.0, "reward": 2.4324777126312256, "reward_std": 0.41315987706184387, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12245608866214752, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1045.6785888671875, "completions/mean_terminated_length": 797.1921997070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6429066112620532, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12608514431960724, "kl": 0.025177001953125, "learning_rate": 4.070638804485371e-07, "loss": 0.0635, "num_tokens": 1675486458.0, "reward": 2.2879464626312256, "reward_std": 0.39923498034477234, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1034.946533203125, "completions/mean_terminated_length": 787.3111572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6431197059293591, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12085812751618726, "kl": 0.02960205078125, "learning_rate": 4.0674649065664925e-07, "loss": 0.0499, "num_tokens": 1676015522.0, "reward": 2.4263393878936768, "reward_std": 0.4028944671154022, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1136813834309578, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 925.1094360351562, "completions/mean_terminated_length": 751.4664916992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6433328005966651, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14452928978424123, "kl": 0.0291748046875, "learning_rate": 4.064291801446136e-07, "loss": 0.0825, "num_tokens": 1676499539.0, "reward": 2.4921875, "reward_std": 0.43308025598526, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15116052329540253, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 906.6920166015625, "completions/mean_terminated_length": 779.2506103515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.643545895263971, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1328273240657277, "kl": 0.03228759765625, "learning_rate": 4.0611194908803727e-07, "loss": 0.0879, "num_tokens": 1676974329.0, "reward": 2.5072546005249023, "reward_std": 0.40214458107948303, "rewards/accuracy_reward/mean": 0.6064814925193787, "rewards/accuracy_reward/std": 0.4890965521335602, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1327671855688095, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 945.3683471679688, "completions/mean_terminated_length": 727.2005615234375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.643758989931277, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12488816663588193, "kl": 0.0299072265625, "learning_rate": 4.057947976624835e-07, "loss": 0.0501, "num_tokens": 1677470494.0, "reward": 2.3565850257873535, "reward_std": 0.40017497539520264, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14801737666130066, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 934.075927734375, "completions/mean_terminated_length": 765.1259765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.6439720845985829, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11737828123114541, "kl": 0.027679443359375, "learning_rate": 4.0547772604347117e-07, "loss": 0.0369, "num_tokens": 1677960896.0, "reward": 2.467076063156128, "reward_std": 0.3626514673233032, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801211535930634, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 878.63623046875, "completions/mean_terminated_length": 731.7311401367188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6441851792658889, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.134153765149431, "kl": 0.029449462890625, "learning_rate": 4.0516073440647525e-07, "loss": 0.0699, "num_tokens": 1678431933.0, "reward": 2.4776787757873535, "reward_std": 0.4417393207550049, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1037.4263916015625, "completions/mean_terminated_length": 817.7364501953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6443982739331948, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13568530261049974, "kl": 0.027923583984375, "learning_rate": 4.0484382292692643e-07, "loss": 0.1408, "num_tokens": 1678969788.0, "reward": 2.4190850257873535, "reward_std": 0.4392755627632141, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920445680618286, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 999.7232666015625, "completions/mean_terminated_length": 802.3023681640625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6446113686005007, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12795894670591043, "kl": 0.028717041015625, "learning_rate": 4.0452699178021076e-07, "loss": 0.0696, "num_tokens": 1679490176.0, "reward": 2.4425225257873535, "reward_std": 0.48326146602630615, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13180458545684814, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1081.43310546875, "completions/mean_terminated_length": 871.309814453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6448244632678067, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1260343593187523, "kl": 0.02630615234375, "learning_rate": 4.0421024114167014e-07, "loss": 0.0902, "num_tokens": 1680040786.0, "reward": 2.32421875, "reward_std": 0.4389100670814514, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15279825031757355, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 934.4285888671875, "completions/mean_terminated_length": 748.8333740234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6450375579351126, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1536566109088807, "kl": 0.032318115234375, "learning_rate": 4.038935711866019e-07, "loss": 0.099, "num_tokens": 1680532962.0, "reward": 2.4324777126312256, "reward_std": 0.4652644991874695, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12540756165981293, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 945.0357666015625, "completions/mean_terminated_length": 747.6632080078125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6452506526024187, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13023979641259983, "kl": 0.0281982421875, "learning_rate": 4.035769820902584e-07, "loss": 0.0833, "num_tokens": 1681034914.0, "reward": 2.3560268878936768, "reward_std": 0.4603950083255768, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1781519651412964, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 925.0647583007812, "completions/mean_terminated_length": 720.6253662109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6454637472697246, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12784006870300718, "kl": 0.03033447265625, "learning_rate": 4.032604740278478e-07, "loss": 0.0729, "num_tokens": 1681514447.0, "reward": 2.5106027126312256, "reward_std": 0.42938828468322754, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692163944244, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1056.029052734375, "completions/mean_terminated_length": 820.367431640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6456768419370306, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.48292507424810344, "kl": 0.041961669921875, "learning_rate": 4.0294404717453267e-07, "loss": 0.0456, "num_tokens": 1682052940.0, "reward": 2.338169813156128, "reward_std": 0.4858367443084717, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15247619152069092, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 984.3750610351562, "completions/mean_terminated_length": 773.9251708984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.6458899366043365, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12674738979150268, "kl": 0.028472900390625, "learning_rate": 4.0262770170543124e-07, "loss": 0.0632, "num_tokens": 1682563700.0, "reward": 2.459263563156128, "reward_std": 0.4306187033653259, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13878051936626434, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 929.185302734375, "completions/mean_terminated_length": 735.8822021484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6461030312716425, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1448916631642093, "kl": 0.03021240234375, "learning_rate": 4.023114377956166e-07, "loss": 0.1153, "num_tokens": 1683042695.0, "reward": 2.5011162757873535, "reward_std": 0.38418418169021606, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1046.107177734375, "completions/mean_terminated_length": 790.7227172851562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6463161259389484, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1438695046424049, "kl": 0.0260009765625, "learning_rate": 4.019952556201165e-07, "loss": 0.1086, "num_tokens": 1683595239.0, "reward": 2.3035714626312256, "reward_std": 0.5223547220230103, "rewards/accuracy_reward/mean": 0.46759259700775146, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17041954398155212, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1098.0692138671875, "completions/mean_terminated_length": 869.1384887695312, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.6465292206062543, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11443706410760887, "kl": 0.024505615234375, "learning_rate": 4.016791553539137e-07, "loss": 0.0755, "num_tokens": 1684149590.0, "reward": 2.4090402126312256, "reward_std": 0.45912063121795654, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16318118572235107, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 979.5513916015625, "completions/mean_terminated_length": 764.7158203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6467423152735603, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14979330264254792, "kl": 0.032073974609375, "learning_rate": 4.0136313717194524e-07, "loss": 0.0806, "num_tokens": 1684663677.0, "reward": 2.4034600257873535, "reward_std": 0.4430224299430847, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1071.53125, "completions/mean_terminated_length": 878.3262329101562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6469554099408662, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13476884764052116, "kl": 0.02734375, "learning_rate": 4.010472012491034e-07, "loss": 0.1281, "num_tokens": 1685216379.0, "reward": 2.2896206378936768, "reward_std": 0.5698955655097961, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.1743151992559433, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1078.665283203125, "completions/mean_terminated_length": 796.5244750976562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6471685046081722, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10901786066654653, "kl": 0.02227783203125, "learning_rate": 4.0073134776023434e-07, "loss": 0.0186, "num_tokens": 1685770293.0, "reward": 2.376674175262451, "reward_std": 0.34220561385154724, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12130890041589737, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 899.74560546875, "completions/mean_terminated_length": 718.7545166015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6473815992754781, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15556613632311114, "kl": 0.030029296875, "learning_rate": 4.004155768801385e-07, "loss": 0.1193, "num_tokens": 1686239715.0, "reward": 2.4419643878936768, "reward_std": 0.4940684139728546, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.170989990234375, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 884.3438110351562, "completions/mean_terminated_length": 711.2872314453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6475946939427841, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1563064627210784, "kl": 0.031494140625, "learning_rate": 4.0009988878357123e-07, "loss": 0.0489, "num_tokens": 1686697773.0, "reward": 2.489955425262451, "reward_std": 0.40380969643592834, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.10225148499011993, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 975.122802734375, "completions/mean_terminated_length": 727.5357055664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.64780778861009, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14172186747070342, "kl": 0.02911376953125, "learning_rate": 3.9978428364524166e-07, "loss": 0.1, "num_tokens": 1687204932.0, "reward": 2.3331475257873535, "reward_std": 0.4661312699317932, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15279822051525116, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 942.0067138671875, "completions/mean_terminated_length": 730.220703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6480208832773959, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13190541895586252, "kl": 0.027496337890625, "learning_rate": 3.9946876163981303e-07, "loss": 0.0736, "num_tokens": 1687696311.0, "reward": 2.45703125, "reward_std": 0.46261462569236755, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13359208405017853, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1049.2410888671875, "completions/mean_terminated_length": 798.1563720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.648233977944702, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.32176250662639433, "kl": 0.030181884765625, "learning_rate": 3.9915332294190287e-07, "loss": 0.0758, "num_tokens": 1688242467.0, "reward": 2.263951063156128, "reward_std": 0.49381107091903687, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17896561324596405, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 963.7053833007812, "completions/mean_terminated_length": 766.3008422851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6484470726120078, "frac_reward_zero_std": 0.0, "grad_norm": 0.12707168109074957, "kl": 0.028472900390625, "learning_rate": 3.988379677260818e-07, "loss": 0.0645, "num_tokens": 1688746399.0, "reward": 2.385044813156128, "reward_std": 0.5021392703056335, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.16051718592643738, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 874.1160888671875, "completions/mean_terminated_length": 752.6798095703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6486601672793139, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13668350349310235, "kl": 0.03082275390625, "learning_rate": 3.985226961668754e-07, "loss": 0.0975, "num_tokens": 1689207411.0, "reward": 2.5066964626312256, "reward_std": 0.43065667152404785, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1168653815984726, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 896.24560546875, "completions/mean_terminated_length": 745.0050659179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6488732619466198, "frac_reward_zero_std": 0.0, "grad_norm": 0.1516798701860554, "kl": 0.03009033203125, "learning_rate": 3.982075084387617e-07, "loss": 0.0945, "num_tokens": 1689676881.0, "reward": 2.5072546005249023, "reward_std": 0.48922237753868103, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1146.5692138671875, "completions/mean_terminated_length": 874.0435791015625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6490863566139258, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12001113937709317, "kl": 0.02227783203125, "learning_rate": 3.978924047161738e-07, "loss": 0.0339, "num_tokens": 1690272240.0, "reward": 2.2527902126312256, "reward_std": 0.42920854687690735, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11682131141424179, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1036.9398193359375, "completions/mean_terminated_length": 823.7973022460938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6492994512812317, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12153460054745978, "kl": 0.02630615234375, "learning_rate": 3.975773851734967e-07, "loss": 0.0736, "num_tokens": 1690808165.0, "reward": 2.23828125, "reward_std": 0.4820942282676697, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 998.97998046875, "completions/mean_terminated_length": 760.4356079101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6495125459485377, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1341528321485664, "kl": 0.0277099609375, "learning_rate": 3.972624499850703e-07, "loss": 0.093, "num_tokens": 1691327916.0, "reward": 2.4129464626312256, "reward_std": 0.49112632870674133, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 968.41748046875, "completions/mean_terminated_length": 733.7255859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6497256406158436, "frac_reward_zero_std": 0.0, "grad_norm": 0.15143826049676645, "kl": 0.029449462890625, "learning_rate": 3.9694759932518663e-07, "loss": 0.1041, "num_tokens": 1691828919.0, "reward": 2.3856027126312256, "reward_std": 0.5096012353897095, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.17171035706996918, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 956.0848388671875, "completions/mean_terminated_length": 774.0989990234375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6499387352831495, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14919818462344184, "kl": 0.0283203125, "learning_rate": 3.966328333680914e-07, "loss": 0.0992, "num_tokens": 1692314589.0, "reward": 2.3900671005249023, "reward_std": 0.5258574485778809, "rewards/accuracy_reward/mean": 0.5601851940155029, "rewards/accuracy_reward/std": 0.496940016746521, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14461299777030945, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 967.3951416015625, "completions/mean_terminated_length": 760.470703125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6501518299504555, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13839694249467505, "kl": 0.02984619140625, "learning_rate": 3.963181522879837e-07, "loss": 0.0471, "num_tokens": 1692812158.0, "reward": 2.439174175262451, "reward_std": 0.42624610662460327, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16374634206295013, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 932.8438110351562, "completions/mean_terminated_length": 746.984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6503649246177614, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14212434964174786, "kl": 0.029876708984375, "learning_rate": 3.960035562590154e-07, "loss": 0.0689, "num_tokens": 1693297336.0, "reward": 2.4458706378936768, "reward_std": 0.4433249235153198, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14469929039478302, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1057.8304443359375, "completions/mean_terminated_length": 819.2022094726562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6505780192850674, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12727239781152336, "kl": 0.025146484375, "learning_rate": 3.956890454552914e-07, "loss": 0.0377, "num_tokens": 1693845564.0, "reward": 2.373326063156128, "reward_std": 0.36856740713119507, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13335825502872467, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1008.7969360351562, "completions/mean_terminated_length": 772.4849243164062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6507911139523733, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.5756412279788655, "kl": 0.07489013671875, "learning_rate": 3.953746200508693e-07, "loss": 0.1053, "num_tokens": 1694376721.0, "reward": 2.3448662757873535, "reward_std": 0.4737739562988281, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1042.294677734375, "completions/mean_terminated_length": 789.4636840820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6510042086196793, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13506602384235966, "kl": 0.026031494140625, "learning_rate": 3.950602802197591e-07, "loss": 0.0438, "num_tokens": 1694919525.0, "reward": 2.3470983505249023, "reward_std": 0.46464523673057556, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.10627460479736328, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1043.415283203125, "completions/mean_terminated_length": 794.36767578125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6512173032869852, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.20849132946117305, "kl": 0.048126220703125, "learning_rate": 3.9474602613592454e-07, "loss": 0.0759, "num_tokens": 1695464399.0, "reward": 2.40234375, "reward_std": 0.4604419469833374, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1581934094429016, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 941.2678833007812, "completions/mean_terminated_length": 711.5687255859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6514303979542911, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1366623426241431, "kl": 0.02801513671875, "learning_rate": 3.944318579732805e-07, "loss": 0.0802, "num_tokens": 1695954887.0, "reward": 2.326451063156128, "reward_std": 0.396142840385437, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 821.6116333007812, "completions/mean_terminated_length": 688.0445556640625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6516434926215972, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13574468995611583, "kl": 0.030792236328125, "learning_rate": 3.941177759056955e-07, "loss": 0.047, "num_tokens": 1696395305.0, "reward": 2.5658483505249023, "reward_std": 0.37883028388023376, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.08887429535388947, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 953.91748046875, "completions/mean_terminated_length": 744.4122314453125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.651856587288903, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14094800299288607, "kl": 0.028350830078125, "learning_rate": 3.938037801069898e-07, "loss": 0.054, "num_tokens": 1696900932.0, "reward": 2.3638393878936768, "reward_std": 0.34052330255508423, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.09060624986886978, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1004.591552734375, "completions/mean_terminated_length": 804.7898559570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6520696819562091, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12427462737778618, "kl": 0.02813720703125, "learning_rate": 3.9348987075093596e-07, "loss": 0.0713, "num_tokens": 1697420317.0, "reward": 2.454799175262451, "reward_std": 0.4182373285293579, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 930.6964721679688, "completions/mean_terminated_length": 783.9797973632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.652282776623515, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14158414708276093, "kl": 0.030609130859375, "learning_rate": 3.93176048011259e-07, "loss": 0.0928, "num_tokens": 1697902149.0, "reward": 2.455357313156128, "reward_std": 0.49798741936683655, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1486160308122635, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 915.21435546875, "completions/mean_terminated_length": 753.3877563476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.652495871290821, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13325849960618702, "kl": 0.030303955078125, "learning_rate": 3.928623120616353e-07, "loss": 0.0765, "num_tokens": 1698379669.0, "reward": 2.4302456378936768, "reward_std": 0.47848281264305115, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13566918671131134, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1104.8638916015625, "completions/mean_terminated_length": 823.2898559570312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6527089659581269, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12039531062356246, "kl": 0.026123046875, "learning_rate": 3.9254866307569433e-07, "loss": 0.0348, "num_tokens": 1698938328.0, "reward": 2.353236675262451, "reward_std": 0.3765774667263031, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 942.83935546875, "completions/mean_terminated_length": 768.640869140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6529220606254329, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12756613966517744, "kl": 0.02801513671875, "learning_rate": 3.922351012270162e-07, "loss": 0.0787, "num_tokens": 1699431712.0, "reward": 2.48828125, "reward_std": 0.44719505310058594, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.49405214190483093, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12870889902114868, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1030.1473388671875, "completions/mean_terminated_length": 791.8071899414062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6531351552927388, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1261735607909035, "kl": 0.025970458984375, "learning_rate": 3.919216266891339e-07, "loss": 0.0856, "num_tokens": 1699959842.0, "reward": 2.4229912757873535, "reward_std": 0.4391017258167267, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15610110759735107, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 941.529052734375, "completions/mean_terminated_length": 773.7095336914062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6533482499600447, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13295850986629326, "kl": 0.029754638671875, "learning_rate": 3.9160823963553127e-07, "loss": 0.0145, "num_tokens": 1700452111.0, "reward": 2.4246652126312256, "reward_std": 0.41219159960746765, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1240563914179802, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1031.96875, "completions/mean_terminated_length": 800.926025390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6535613446273507, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12957773420062366, "kl": 0.025299072265625, "learning_rate": 3.91294940239644e-07, "loss": 0.0852, "num_tokens": 1700990177.0, "reward": 2.4793527126312256, "reward_std": 0.40322282910346985, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780036240816116, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1109.654052734375, "completions/mean_terminated_length": 860.4887084960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6537744392946566, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12357394879429899, "kl": 0.023834228515625, "learning_rate": 3.909817286748597e-07, "loss": 0.0596, "num_tokens": 1701560230.0, "reward": 2.4609375, "reward_std": 0.45786532759666443, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559327840805, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 937.3370971679688, "completions/mean_terminated_length": 738.5868530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6539875339619626, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13459269312145158, "kl": 0.028900146484375, "learning_rate": 3.906686051145166e-07, "loss": 0.1433, "num_tokens": 1702046589.0, "reward": 2.404576063156128, "reward_std": 0.4564734399318695, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.15706878900527954, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 909.7500610351562, "completions/mean_terminated_length": 688.170654296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6542006286292685, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15120348125988642, "kl": 0.031402587890625, "learning_rate": 3.9035556973190484e-07, "loss": 0.0631, "num_tokens": 1702523101.0, "reward": 2.4810268878936768, "reward_std": 0.365992933511734, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1095.4129638671875, "completions/mean_terminated_length": 881.9917602539062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6544137232965745, "frac_reward_zero_std": 0.0, "grad_norm": 0.12144369579231543, "kl": 0.02325439453125, "learning_rate": 3.9004262270026543e-07, "loss": 0.0242, "num_tokens": 1703088006.0, "reward": 2.2578125, "reward_std": 0.4320054054260254, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.16310946643352509, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1036.529052734375, "completions/mean_terminated_length": 823.300048828125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6546268179638804, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10870045352907412, "kl": 0.02581787109375, "learning_rate": 3.897297641927909e-07, "loss": 0.088, "num_tokens": 1703627971.0, "reward": 2.4419643878936768, "reward_std": 0.3953354060649872, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 895.8638916015625, "completions/mean_terminated_length": 776.6773071289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6548399126311863, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2536314090854561, "kl": 0.039398193359375, "learning_rate": 3.894169943826242e-07, "loss": 0.0411, "num_tokens": 1704106774.0, "reward": 2.650669813156128, "reward_std": 0.3904971778392792, "rewards/accuracy_reward/mean": 0.7366071343421936, "rewards/accuracy_reward/std": 0.44096609950065613, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11589459329843521, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 916.1428833007812, "completions/mean_terminated_length": 724.05224609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6550530072984924, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.134044725451534, "kl": 0.030120849609375, "learning_rate": 3.891043134428593e-07, "loss": 0.0941, "num_tokens": 1704588534.0, "reward": 2.3989956378936768, "reward_std": 0.39790162444114685, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11088934540748596, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 987.8928833007812, "completions/mean_terminated_length": 814.4207763671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6552661019657983, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12312706404244447, "kl": 0.027008056640625, "learning_rate": 3.8879172154654163e-07, "loss": 0.0796, "num_tokens": 1705096070.0, "reward": 2.506138563156128, "reward_std": 0.4249289333820343, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1258348822593689, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1078.71435546875, "completions/mean_terminated_length": 851.74658203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6554791966331043, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1260331228238506, "kl": 0.0240478515625, "learning_rate": 3.8847921886666634e-07, "loss": 0.097, "num_tokens": 1705651174.0, "reward": 2.353236675262451, "reward_std": 0.4320808947086334, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16628077626228333, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 901.5469360351562, "completions/mean_terminated_length": 757.5200805664062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6556922913004102, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12905305697282277, "kl": 0.029510498046875, "learning_rate": 3.881668055761803e-07, "loss": 0.0864, "num_tokens": 1706125259.0, "reward": 2.5150671005249023, "reward_std": 0.395595908164978, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.12055531144142151, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 950.7723388671875, "completions/mean_terminated_length": 797.21630859375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6559053859677162, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1183443618691964, "kl": 0.02716064453125, "learning_rate": 3.8785448184798006e-07, "loss": 0.1062, "num_tokens": 1706623509.0, "reward": 2.4693081378936768, "reward_std": 0.4127633273601532, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.2463276982307434, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1119314506649971, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 989.0223388671875, "completions/mean_terminated_length": 782.8746337890625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6561184806350221, "frac_reward_zero_std": 0.25, "grad_norm": 0.11486868110023093, "kl": 0.027587890625, "learning_rate": 3.8754224785491283e-07, "loss": 0.0524, "num_tokens": 1707138143.0, "reward": 2.513392925262451, "reward_std": 0.3562312424182892, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12733516097068787, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1059.122802734375, "completions/mean_terminated_length": 803.5702514648438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6563315753023281, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11817245490244976, "kl": 0.02557373046875, "learning_rate": 3.872301037697766e-07, "loss": 0.1182, "num_tokens": 1707679286.0, "reward": 2.3515625, "reward_std": 0.36329910159111023, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 960.7969360351562, "completions/mean_terminated_length": 779.5963745117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.656544669969634, "frac_reward_zero_std": 0.25, "grad_norm": 0.11746313667325094, "kl": 0.029205322265625, "learning_rate": 3.869180497653186e-07, "loss": 0.0389, "num_tokens": 1708177723.0, "reward": 2.400111675262451, "reward_std": 0.36465469002723694, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13170984387397766, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 892.107177734375, "completions/mean_terminated_length": 763.0372314453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6567577646369399, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1250789780598846, "kl": 0.030181884765625, "learning_rate": 3.866060860142375e-07, "loss": 0.0682, "num_tokens": 1708644411.0, "reward": 2.6411831378936768, "reward_std": 0.3977411389350891, "rewards/accuracy_reward/mean": 0.7075892686843872, "rewards/accuracy_reward/std": 0.4553784728050232, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.086386539041996, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 962.6473388671875, "completions/mean_terminated_length": 751.3652954101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6569708593042459, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12350389008524336, "kl": 0.02972412109375, "learning_rate": 3.862942126891809e-07, "loss": 0.0491, "num_tokens": 1709141837.0, "reward": 2.3353796005249023, "reward_std": 0.39777690172195435, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1514935940504074, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1149.763427734375, "completions/mean_terminated_length": 857.4378662109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6571839539715518, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1270818929851658, "kl": 0.0235595703125, "learning_rate": 3.859824299627469e-07, "loss": 0.1207, "num_tokens": 1709728435.0, "reward": 2.2935268878936768, "reward_std": 0.5091504454612732, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16599240899085999, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1001.075927734375, "completions/mean_terminated_length": 810.4749755859375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6573970486388578, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12347514829414148, "kl": 0.02777099609375, "learning_rate": 3.856707380074836e-07, "loss": 0.0734, "num_tokens": 1710244245.0, "reward": 2.4034600257873535, "reward_std": 0.4112948477268219, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15933358669281006, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 931.0848388671875, "completions/mean_terminated_length": 764.9794921875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6576101433061637, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1441042502814212, "kl": 0.02874755859375, "learning_rate": 3.853591369958884e-07, "loss": 0.1047, "num_tokens": 1710726123.0, "reward": 2.5150671005249023, "reward_std": 0.4763098359107971, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13878051936626434, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1082.274658203125, "completions/mean_terminated_length": 856.1405029296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6578232379734698, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13085358210702872, "kl": 0.0257568359375, "learning_rate": 3.850476271004087e-07, "loss": 0.0768, "num_tokens": 1711287750.0, "reward": 2.2974331378936768, "reward_std": 0.47224316000938416, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15107274055480957, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 827.107177734375, "completions/mean_terminated_length": 704.117919921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6580363326407757, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12891073604438358, "kl": 0.03436279296875, "learning_rate": 3.8473620849344127e-07, "loss": 0.0059, "num_tokens": 1711726934.0, "reward": 2.588169813156128, "reward_std": 0.40565165877342224, "rewards/accuracy_reward/mean": 0.6808035969734192, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11468179523944855, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 963.2500610351562, "completions/mean_terminated_length": 755.5319213867188, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.6582494273080817, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11382157880361331, "kl": 0.02630615234375, "learning_rate": 3.8442488134733276e-07, "loss": 0.0075, "num_tokens": 1712227366.0, "reward": 2.5122768878936768, "reward_std": 0.36341267824172974, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9899553656578064, "rewards/tag_count_reward/std": 0.07219472527503967, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 919.6138916015625, "completions/mean_terminated_length": 774.6574096679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6584625219753876, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12375196769218905, "kl": 0.031982421875, "learning_rate": 3.8411364583437876e-07, "loss": 0.0507, "num_tokens": 1712706649.0, "reward": 2.470982313156128, "reward_std": 0.38491514325141907, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1486160308122635, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 907.1741333007812, "completions/mean_terminated_length": 723.9326171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6586756166426935, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1417443310356198, "kl": 0.027923583984375, "learning_rate": 3.838025021268241e-07, "loss": 0.0731, "num_tokens": 1713185959.0, "reward": 2.4581475257873535, "reward_std": 0.42494910955429077, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10500273108482361, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1062.625, "completions/mean_terminated_length": 858.1131591796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6588887113099995, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272366908118341, "kl": 0.025482177734375, "learning_rate": 3.8349145039686317e-07, "loss": 0.0446, "num_tokens": 1713730271.0, "reward": 2.3565850257873535, "reward_std": 0.4734190106391907, "rewards/accuracy_reward/mean": 0.48842594027519226, "rewards/accuracy_reward/std": 0.500445544719696, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12992529571056366, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1078.732177734375, "completions/mean_terminated_length": 821.35595703125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6591018059773054, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.8727786147800605, "kl": 0.025543212890625, "learning_rate": 3.831804908166393e-07, "loss": 0.0516, "num_tokens": 1714300007.0, "reward": 2.3560268878936768, "reward_std": 0.3809712529182434, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1401200145483017, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1038.328125, "completions/mean_terminated_length": 791.5194702148438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6593149006446114, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13565806163532326, "kl": 0.02777099609375, "learning_rate": 3.8286962355824495e-07, "loss": 0.0931, "num_tokens": 1714838922.0, "reward": 2.4090402126312256, "reward_std": 0.4496128559112549, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12450840324163437, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 889.5848388671875, "completions/mean_terminated_length": 720.7109985351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6595279953119173, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13736879986117598, "kl": 0.03167724609375, "learning_rate": 3.825588487937211e-07, "loss": 0.0386, "num_tokens": 1715302832.0, "reward": 2.482142925262451, "reward_std": 0.40892651677131653, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13271193206310272, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1093.77685546875, "completions/mean_terminated_length": 853.8882446289062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6597410899792233, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12639586455013752, "kl": 0.024322509765625, "learning_rate": 3.82248166695058e-07, "loss": 0.0959, "num_tokens": 1715865532.0, "reward": 2.3939733505249023, "reward_std": 0.4523405432701111, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1479213982820511, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 904.7902221679688, "completions/mean_terminated_length": 734.7743530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6599541846465292, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12700185227354238, "kl": 0.032806396484375, "learning_rate": 3.819375774341944e-07, "loss": 0.0374, "num_tokens": 1716342062.0, "reward": 2.4966518878936768, "reward_std": 0.38179266452789307, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13911856710910797, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1004.0402221679688, "completions/mean_terminated_length": 759.5867919921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6601672793138351, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13718327695824983, "kl": 0.028594970703125, "learning_rate": 3.8162708118301736e-07, "loss": 0.096, "num_tokens": 1716855728.0, "reward": 2.4832589626312256, "reward_std": 0.41527578234672546, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1599874496459961, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 968.22998046875, "completions/mean_terminated_length": 804.4601440429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6603803739811411, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1286667466469969, "kl": 0.027984619140625, "learning_rate": 3.8131667811336334e-07, "loss": 0.063, "num_tokens": 1717351223.0, "reward": 2.474330425262451, "reward_std": 0.4458337724208832, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1020.3192138671875, "completions/mean_terminated_length": 754.73876953125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.660593468648447, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12509926888190004, "kl": 0.02569580078125, "learning_rate": 3.8100636839701594e-07, "loss": 0.098, "num_tokens": 1717879510.0, "reward": 2.2974331378936768, "reward_std": 0.3995503783226013, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1477556824684143, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 895.9397583007812, "completions/mean_terminated_length": 747.9420166015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.660806563315753, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12471956882233191, "kl": 0.0284423828125, "learning_rate": 3.806961522057087e-07, "loss": 0.0318, "num_tokens": 1718358507.0, "reward": 2.4877233505249023, "reward_std": 0.3503088355064392, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.0852610394358635, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 990.700927734375, "completions/mean_terminated_length": 804.7716674804688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.661019657983059, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13476767129483264, "kl": 0.0267333984375, "learning_rate": 3.80386029711122e-07, "loss": 0.0793, "num_tokens": 1718873893.0, "reward": 2.3716518878936768, "reward_std": 0.487751841545105, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15208269655704498, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1101.450927734375, "completions/mean_terminated_length": 846.7138671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.661232752650365, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1135335624883966, "kl": 0.022247314453125, "learning_rate": 3.8007600108488503e-07, "loss": 0.1092, "num_tokens": 1719446543.0, "reward": 2.310267925262451, "reward_std": 0.41257601976394653, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.49405214190483093, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17468811571598053, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1042.009033203125, "completions/mean_terminated_length": 809.857177734375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6614458473176709, "frac_reward_zero_std": 0.0, "grad_norm": 0.14557508118414644, "kl": 0.026214599609375, "learning_rate": 3.797660664985748e-07, "loss": 0.142, "num_tokens": 1719984963.0, "reward": 2.41796875, "reward_std": 0.5324880480766296, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 931.9710083007812, "completions/mean_terminated_length": 762.7017822265625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6616589419849769, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13670827050044176, "kl": 0.030853271484375, "learning_rate": 3.794562261237164e-07, "loss": 0.0901, "num_tokens": 1720472278.0, "reward": 2.333705425262451, "reward_std": 0.4587516784667969, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067808747291565, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1406535655260086, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1215.4241943359375, "completions/mean_terminated_length": 927.89794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6618720366522828, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11510770536061836, "kl": 0.02447509765625, "learning_rate": 3.7914648013178297e-07, "loss": 0.0829, "num_tokens": 1721088180.0, "reward": 2.2393975257873535, "reward_std": 0.4575740098953247, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1611565798521042, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 852.872802734375, "completions/mean_terminated_length": 712.7955322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6620851313195887, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14223276586948388, "kl": 0.03076171875, "learning_rate": 3.7883682869419507e-07, "loss": 0.0674, "num_tokens": 1721540091.0, "reward": 2.578125, "reward_std": 0.42145445942878723, "rewards/accuracy_reward/mean": 0.6785714030265808, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10289046913385391, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1118.921875, "completions/mean_terminated_length": 848.49853515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6622982259868947, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1263835314218856, "kl": 0.022491455078125, "learning_rate": 3.7852727198232104e-07, "loss": 0.0589, "num_tokens": 1722114984.0, "reward": 2.326451063156128, "reward_std": 0.417420357465744, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13903217017650604, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1041.6004638671875, "completions/mean_terminated_length": 792.10302734375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6625113206542006, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13141432484372964, "kl": 0.026580810546875, "learning_rate": 3.782178101674768e-07, "loss": 0.099, "num_tokens": 1722649381.0, "reward": 2.3253350257873535, "reward_std": 0.5206719636917114, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.1875956654548645, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 950.6875610351562, "completions/mean_terminated_length": 797.1195678710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6627244153215066, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11927302853964485, "kl": 0.02825927734375, "learning_rate": 3.7790844342092576e-07, "loss": 0.0947, "num_tokens": 1723144329.0, "reward": 2.5418527126312256, "reward_std": 0.4637902081012726, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12315750867128372, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 962.5535888671875, "completions/mean_terminated_length": 768.3157958984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6629375099888125, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13194249591660048, "kl": 0.028717041015625, "learning_rate": 3.775991719138789e-07, "loss": 0.097, "num_tokens": 1723643361.0, "reward": 2.443080425262451, "reward_std": 0.47731804847717285, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.16172590851783752, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1047.58935546875, "completions/mean_terminated_length": 849.6470947265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6631506046561185, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11170216943161124, "kl": 0.024505615234375, "learning_rate": 3.772899958174942e-07, "loss": 0.0696, "num_tokens": 1724187289.0, "reward": 2.40234375, "reward_std": 0.3776506185531616, "rewards/accuracy_reward/mean": 0.49074074625968933, "rewards/accuracy_reward/std": 0.5004938244819641, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1256263703107834, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1036.234375, "completions/mean_terminated_length": 806.1616821289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6633636993234244, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.7692815477780209, "kl": 0.09539794921875, "learning_rate": 3.7698091530287703e-07, "loss": 0.107, "num_tokens": 1724729058.0, "reward": 2.40234375, "reward_std": 0.44508540630340576, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1518804132938385, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 997.1428833007812, "completions/mean_terminated_length": 782.4515991210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6635767939907303, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13817653175526767, "kl": 0.02783203125, "learning_rate": 3.7667193054107984e-07, "loss": 0.0778, "num_tokens": 1725243074.0, "reward": 2.4369421005249023, "reward_std": 0.45300424098968506, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13099703192710876, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 976.513427734375, "completions/mean_terminated_length": 761.0670776367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6637898886580363, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13151625241229475, "kl": 0.029815673828125, "learning_rate": 3.7636304170310176e-07, "loss": 0.0814, "num_tokens": 1725746248.0, "reward": 2.467076063156128, "reward_std": 0.46433356404304504, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9425223469734192, "rewards/tag_count_reward/std": 0.19041606783866882, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1006.0223388671875, "completions/mean_terminated_length": 819.5631713867188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6640029833253422, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12057276742612635, "kl": 0.026092529296875, "learning_rate": 3.760542489598896e-07, "loss": 0.0552, "num_tokens": 1726264674.0, "reward": 2.482142925262451, "reward_std": 0.40842026472091675, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12040118128061295, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1057.890625, "completions/mean_terminated_length": 802.0196533203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6642160779926483, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13355482734526317, "kl": 0.025726318359375, "learning_rate": 3.7574555248233574e-07, "loss": 0.1035, "num_tokens": 1726804641.0, "reward": 2.37109375, "reward_std": 0.4875785708427429, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15586401522159576, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 962.05810546875, "completions/mean_terminated_length": 771.0918579101562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6644291726599542, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12193429456029223, "kl": 0.03070068359375, "learning_rate": 3.75436952441281e-07, "loss": 0.0398, "num_tokens": 1727303659.0, "reward": 2.364955425262451, "reward_std": 0.3756348490715027, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15063102543354034, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 932.1495971679688, "completions/mean_terminated_length": 722.0026245117188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6646422673272602, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12381574597006693, "kl": 0.028594970703125, "learning_rate": 3.7512844900751126e-07, "loss": 0.0559, "num_tokens": 1727792190.0, "reward": 2.4715402126312256, "reward_std": 0.4454174041748047, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 996.52685546875, "completions/mean_terminated_length": 808.3684692382812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6648553619945661, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11977573656996728, "kl": 0.02752685546875, "learning_rate": 3.7482004235175977e-07, "loss": 0.0826, "num_tokens": 1728317242.0, "reward": 2.4151787757873535, "reward_std": 0.4344754219055176, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14140157401561737, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 954.044677734375, "completions/mean_terminated_length": 737.5935668945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6650684566618721, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 1.81741237290172, "kl": 0.03192138671875, "learning_rate": 3.745117326447059e-07, "loss": 0.0658, "num_tokens": 1728827198.0, "reward": 2.4536831378936768, "reward_std": 0.4123454988002777, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1034.575927734375, "completions/mean_terminated_length": 772.6798095703125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.665281551329178, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1325237996618248, "kl": 0.028045654296875, "learning_rate": 3.742035200569755e-07, "loss": 0.1099, "num_tokens": 1729359824.0, "reward": 2.255580425262451, "reward_std": 0.437264621257782, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16713166236877441, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1044.950927734375, "completions/mean_terminated_length": 830.2059936523438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6654946459964839, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.2223081589402521, "kl": 0.025543212890625, "learning_rate": 3.738954047591407e-07, "loss": 0.0685, "num_tokens": 1729899290.0, "reward": 2.404576063156128, "reward_std": 0.3951433598995209, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1395251452922821, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 965.4910888671875, "completions/mean_terminated_length": 788.3532104492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6657077406637899, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12772861171746103, "kl": 0.02581787109375, "learning_rate": 3.7358738692171965e-07, "loss": 0.0836, "num_tokens": 1730401302.0, "reward": 2.458705425262451, "reward_std": 0.5159845352172852, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17990919947624207, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1069.12060546875, "completions/mean_terminated_length": 833.2132568359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6659208353310958, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12351318564293766, "kl": 0.025970458984375, "learning_rate": 3.7327946671517685e-07, "loss": 0.0618, "num_tokens": 1730950252.0, "reward": 2.4017858505249023, "reward_std": 0.4600284993648529, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1756715625524521, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 958.3192138671875, "completions/mean_terminated_length": 770.0497436523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6661339299984018, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1201213837583874, "kl": 0.030059814453125, "learning_rate": 3.7297164430992244e-07, "loss": 0.0512, "num_tokens": 1731442603.0, "reward": 2.4229912757873535, "reward_std": 0.43238067626953125, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13915446400642395, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 940.5022583007812, "completions/mean_terminated_length": 762.6139526367188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6663470246657077, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1322184789702175, "kl": 0.027862548828125, "learning_rate": 3.726639198763124e-07, "loss": 0.0685, "num_tokens": 1731931228.0, "reward": 2.44140625, "reward_std": 0.37991076707839966, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.1057254895567894, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1057.779052734375, "completions/mean_terminated_length": 825.9091186523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6665601193330137, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12688346931750136, "kl": 0.025360107421875, "learning_rate": 3.723562935846489e-07, "loss": 0.066, "num_tokens": 1732472361.0, "reward": 2.3515625, "reward_std": 0.47722697257995605, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.1648755669593811, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 985.1563110351562, "completions/mean_terminated_length": 788.3333129882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6667732140003196, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14010179889831353, "kl": 0.02728271484375, "learning_rate": 3.720487656051793e-07, "loss": 0.0995, "num_tokens": 1732989583.0, "reward": 2.4229912757873535, "reward_std": 0.48437464237213135, "rewards/accuracy_reward/mean": 0.5717592835426331, "rewards/accuracy_reward/std": 0.49539753794670105, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15964375436306, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1052.399658203125, "completions/mean_terminated_length": 871.1425170898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6669863086676255, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1249553346825921, "kl": 0.025146484375, "learning_rate": 3.71741336108097e-07, "loss": 0.0983, "num_tokens": 1733530466.0, "reward": 2.5011162757873535, "reward_std": 0.43405023217201233, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13810986280441284, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 983.872802734375, "completions/mean_terminated_length": 806.5182495117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6671994033349316, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1252057133838651, "kl": 0.027740478515625, "learning_rate": 3.7143400526354065e-07, "loss": 0.0961, "num_tokens": 1734036121.0, "reward": 2.501674175262451, "reward_std": 0.4200673997402191, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16628827154636383, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1085.8192138671875, "completions/mean_terminated_length": 867.0219116210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6674124980022375, "frac_reward_zero_std": 0.0, "grad_norm": 0.12501167367196567, "kl": 0.026641845703125, "learning_rate": 3.7112677324159424e-07, "loss": 0.0421, "num_tokens": 1734597944.0, "reward": 2.39453125, "reward_std": 0.5232722759246826, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16374634206295013, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1002.966552734375, "completions/mean_terminated_length": 812.7097778320312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6676255926695435, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12504874351742548, "kl": 0.02734375, "learning_rate": 3.708196402122875e-07, "loss": 0.0932, "num_tokens": 1735126825.0, "reward": 2.407924175262451, "reward_std": 0.39183828234672546, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12651757895946503, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 970.0201416015625, "completions/mean_terminated_length": 753.2681274414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6678386873368494, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14446529017385498, "kl": 0.028900146484375, "learning_rate": 3.7051260634559445e-07, "loss": 0.0893, "num_tokens": 1735629394.0, "reward": 2.4603796005249023, "reward_std": 0.40502676367759705, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12450841069221497, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1057.71875, "completions/mean_terminated_length": 794.7626953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6680517820041554, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1137649200213167, "kl": 0.02490234375, "learning_rate": 3.702056718114355e-07, "loss": 0.0343, "num_tokens": 1736174900.0, "reward": 2.4291296005249023, "reward_std": 0.3963313400745392, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14052370190620422, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1019.2991333007812, "completions/mean_terminated_length": 735.0142211914062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6682648766714613, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13589099142745445, "kl": 0.0264892578125, "learning_rate": 3.6989883677967483e-07, "loss": 0.0849, "num_tokens": 1736701914.0, "reward": 2.2935268878936768, "reward_std": 0.4701041281223297, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17498444020748138, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1011.7902221679688, "completions/mean_terminated_length": 816.6419067382812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6684779713387673, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12305038708272101, "kl": 0.02642822265625, "learning_rate": 3.6959210142012274e-07, "loss": 0.0551, "num_tokens": 1737222876.0, "reward": 2.4609375, "reward_std": 0.4333146810531616, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15571676194667816, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 939.62060546875, "completions/mean_terminated_length": 709.5795288085938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6686910660060732, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11888697442220018, "kl": 0.029144287109375, "learning_rate": 3.692854659025334e-07, "loss": 0.0698, "num_tokens": 1737706978.0, "reward": 2.4927456378936768, "reward_std": 0.4107138514518738, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416090607643127, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 966.24560546875, "completions/mean_terminated_length": 755.6640014648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6689041606733791, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11980726782629948, "kl": 0.02886962890625, "learning_rate": 3.6897893039660597e-07, "loss": 0.075, "num_tokens": 1738204160.0, "reward": 2.3950893878936768, "reward_std": 0.40266507863998413, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 933.0335083007812, "completions/mean_terminated_length": 750.5844116210938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6691172553406851, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13563677144536043, "kl": 0.03057861328125, "learning_rate": 3.6867249507198486e-07, "loss": 0.1168, "num_tokens": 1738681951.0, "reward": 2.4073662757873535, "reward_std": 0.4084078073501587, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1542993038892746, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 883.80810546875, "completions/mean_terminated_length": 710.6718139648438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.669330350007991, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.24132016109008478, "kl": 0.03216552734375, "learning_rate": 3.6836616009825805e-07, "loss": 0.0461, "num_tokens": 1739153625.0, "reward": 2.572544813156128, "reward_std": 0.3973178565502167, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.2463276982307434, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13033421337604523, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 848.0223388671875, "completions/mean_terminated_length": 733.5990600585938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.669543444675297, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13881914498148673, "kl": 0.0347900390625, "learning_rate": 3.680599256449589e-07, "loss": 0.0371, "num_tokens": 1739608083.0, "reward": 2.4988839626312256, "reward_std": 0.3701789081096649, "rewards/accuracy_reward/mean": 0.5763888955116272, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174960792064667, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.09495897591114044, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 967.4107666015625, "completions/mean_terminated_length": 770.6807861328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6697565393426029, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1563214311190711, "kl": 0.02960205078125, "learning_rate": 3.677537918815646e-07, "loss": 0.0865, "num_tokens": 1740113691.0, "reward": 2.3521206378936768, "reward_std": 0.4870644509792328, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.14974473416805267, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 980.6339721679688, "completions/mean_terminated_length": 769.5160522460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6699696340099089, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12406385365521579, "kl": 0.02862548828125, "learning_rate": 3.674477589774966e-07, "loss": 0.0743, "num_tokens": 1740628919.0, "reward": 2.3872768878936768, "reward_std": 0.36976245045661926, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13763901591300964, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1048.821533203125, "completions/mean_terminated_length": 854.3146362304688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6701827286772148, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12198801427599272, "kl": 0.023773193359375, "learning_rate": 3.6714182710212094e-07, "loss": 0.046, "num_tokens": 1741180087.0, "reward": 2.450892925262451, "reward_std": 0.4436436593532562, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11162012070417404, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 962.7120971679688, "completions/mean_terminated_length": 708.581298828125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6703958233445209, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1450248931589689, "kl": 0.029296875, "learning_rate": 3.6683599642474716e-07, "loss": 0.1163, "num_tokens": 1741679782.0, "reward": 2.3521206378936768, "reward_std": 0.5184926390647888, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16142748296260834, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 895.029052734375, "completions/mean_terminated_length": 759.892822265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6706089180118268, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1474702976756312, "kl": 0.03076171875, "learning_rate": 3.6653026711462966e-07, "loss": 0.1044, "num_tokens": 1742148227.0, "reward": 2.4375, "reward_std": 0.47774118185043335, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.152545765042305, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1013.05810546875, "completions/mean_terminated_length": 798.2587280273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6708220126791327, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12896039261732112, "kl": 0.02606201171875, "learning_rate": 3.662246393409657e-07, "loss": 0.0854, "num_tokens": 1742668861.0, "reward": 2.3744421005249023, "reward_std": 0.38497596979141235, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1095.919677734375, "completions/mean_terminated_length": 825.8452758789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6710351073464387, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12341173822859448, "kl": 0.024566650390625, "learning_rate": 3.65919113272897e-07, "loss": 0.1024, "num_tokens": 1743230521.0, "reward": 2.2862725257873535, "reward_std": 0.48895934224128723, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.170974463224411, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1039.3125, "completions/mean_terminated_length": 789.2479248046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6712482020137446, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12922827712692214, "kl": 0.025390625, "learning_rate": 3.656136890795092e-07, "loss": 0.0878, "num_tokens": 1743770037.0, "reward": 2.3666296005249023, "reward_std": 0.43791985511779785, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14201714098453522, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1064.779052734375, "completions/mean_terminated_length": 803.69775390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6714612966810506, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13503756607025763, "kl": 0.026947021484375, "learning_rate": 3.6530836692983056e-07, "loss": 0.0639, "num_tokens": 1744310994.0, "reward": 2.390625, "reward_std": 0.40812820196151733, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14816173911094666, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 890.3906860351562, "completions/mean_terminated_length": 751.4774780273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6716743913483565, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1255107174827235, "kl": 0.030853271484375, "learning_rate": 3.650031469928342e-07, "loss": 0.0595, "num_tokens": 1744779201.0, "reward": 2.4620537757873535, "reward_std": 0.43585923314094543, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1457662582397461, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1068.477783203125, "completions/mean_terminated_length": 858.7696533203125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.6718874860156625, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11058912215797607, "kl": 0.02569580078125, "learning_rate": 3.646980294374354e-07, "loss": 0.0462, "num_tokens": 1745332199.0, "reward": 2.4810268878936768, "reward_std": 0.40336552262306213, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10171286761760712, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1075.341552734375, "completions/mean_terminated_length": 840.9334716796875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6721005806829684, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11722160299189707, "kl": 0.025604248046875, "learning_rate": 3.6439301443249393e-07, "loss": 0.1001, "num_tokens": 1745879824.0, "reward": 2.3699777126312256, "reward_std": 0.5322286486625671, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.1867084950208664, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 974.0178833007812, "completions/mean_terminated_length": 736.9808959960938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6723136753502743, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14085428645184883, "kl": 0.028076171875, "learning_rate": 3.6408810214681185e-07, "loss": 0.0627, "num_tokens": 1746383064.0, "reward": 2.408482313156128, "reward_std": 0.3791973292827606, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 988.1094360351562, "completions/mean_terminated_length": 788.5013427734375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6725267700175803, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14533547741062058, "kl": 0.0264892578125, "learning_rate": 3.6378329274913475e-07, "loss": 0.0914, "num_tokens": 1746892025.0, "reward": 2.424107313156128, "reward_std": 0.43542560935020447, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 969.69873046875, "completions/mean_terminated_length": 783.3953247070312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6727398646848862, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13387323037041368, "kl": 0.02618408203125, "learning_rate": 3.6347858640815175e-07, "loss": 0.1273, "num_tokens": 1747393426.0, "reward": 2.4447546005249023, "reward_std": 0.4634547531604767, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14248228073120117, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 901.4464721679688, "completions/mean_terminated_length": 767.0623779296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6729529593521922, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14417524839413087, "kl": 0.029449462890625, "learning_rate": 3.631739832924938e-07, "loss": 0.0881, "num_tokens": 1747864442.0, "reward": 2.4737725257873535, "reward_std": 0.40823379158973694, "rewards/accuracy_reward/mean": 0.5925925970077515, "rewards/accuracy_reward/std": 0.49192148447036743, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14827017486095428, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1046.4754638671875, "completions/mean_terminated_length": 835.34326171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6731660540194981, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12140019305384736, "kl": 0.0250244140625, "learning_rate": 3.62869483570736e-07, "loss": 0.0704, "num_tokens": 1748403871.0, "reward": 2.3136162757873535, "reward_std": 0.4314676821231842, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10711704939603806, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 907.0089721679688, "completions/mean_terminated_length": 723.7409057617188, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6733791486868042, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.135318506368615, "kl": 0.029510498046875, "learning_rate": 3.6256508741139555e-07, "loss": 0.0715, "num_tokens": 1748881683.0, "reward": 2.4832589626312256, "reward_std": 0.45923757553100586, "rewards/accuracy_reward/mean": 0.5972222089767456, "rewards/accuracy_reward/std": 0.4910254180431366, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14886364340782166, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1196.1138916015625, "completions/mean_terminated_length": 894.9939575195312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.67359224335411, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13121319126883563, "kl": 0.02392578125, "learning_rate": 3.6226079498293206e-07, "loss": 0.116, "num_tokens": 1749484054.0, "reward": 2.1941964626312256, "reward_std": 0.5945513844490051, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.8660714030265808, "rewards/format_reward/std": 0.34095630049705505, "rewards/tag_count_reward/mean": 0.9352678656578064, "rewards/tag_count_reward/std": 0.2121729850769043, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1026.76123046875, "completions/mean_terminated_length": 804.7527465820312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6738053380214161, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15588622314227032, "kl": 0.026519775390625, "learning_rate": 3.6195660645374837e-07, "loss": 0.1051, "num_tokens": 1750014267.0, "reward": 2.3409600257873535, "reward_std": 0.48787736892700195, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15030226111412048, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 945.46435546875, "completions/mean_terminated_length": 754.9738159179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.674018432688722, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13211332699650077, "kl": 0.027587890625, "learning_rate": 3.6165252199218967e-07, "loss": 0.0843, "num_tokens": 1750516347.0, "reward": 2.4927456378936768, "reward_std": 0.39208781719207764, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.11001905798912048, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1072.0826416015625, "completions/mean_terminated_length": 830.1420288085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6742315273560279, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14208125442396127, "kl": 0.024261474609375, "learning_rate": 3.6134854176654316e-07, "loss": 0.116, "num_tokens": 1751065664.0, "reward": 2.3621652126312256, "reward_std": 0.48551085591316223, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17739619314670563, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 996.4442138671875, "completions/mean_terminated_length": 801.7116088867188, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6744446220233339, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1304505816535144, "kl": 0.027862548828125, "learning_rate": 3.6104466594503867e-07, "loss": 0.0629, "num_tokens": 1751578791.0, "reward": 2.4732143878936768, "reward_std": 0.45345214009284973, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1540280133485794, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 958.94873046875, "completions/mean_terminated_length": 764.0657958984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6746577166906398, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13166762557239953, "kl": 0.02728271484375, "learning_rate": 3.607408946958486e-07, "loss": 0.0758, "num_tokens": 1752087312.0, "reward": 2.55859375, "reward_std": 0.381770521402359, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9871651530265808, "rewards/tag_count_reward/std": 0.08346119523048401, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1098.232177734375, "completions/mean_terminated_length": 888.6103515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6748708113579458, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1878031091802857, "kl": 0.026275634765625, "learning_rate": 3.6043722818708646e-07, "loss": 0.0801, "num_tokens": 1752657720.0, "reward": 2.375, "reward_std": 0.439854234457016, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14721500873565674, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 957.716552734375, "completions/mean_terminated_length": 755.8121337890625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6750839060252517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13998441277369844, "kl": 0.0269775390625, "learning_rate": 3.6013366658680864e-07, "loss": 0.1062, "num_tokens": 1753149769.0, "reward": 2.4029018878936768, "reward_std": 0.4386986792087555, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005797147750854, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585102021694183, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1056.57373046875, "completions/mean_terminated_length": 821.0414428710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6752970006925577, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14049653958372124, "kl": 0.0272216796875, "learning_rate": 3.598302100630135e-07, "loss": 0.1148, "num_tokens": 1753686682.0, "reward": 2.4146206378936768, "reward_std": 0.4072682559490204, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.14072787761688232, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 937.9420166015625, "completions/mean_terminated_length": 766.2835083007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6755100953598636, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1245964897125346, "kl": 0.028106689453125, "learning_rate": 3.59526858783641e-07, "loss": 0.0587, "num_tokens": 1754174752.0, "reward": 2.4246652126312256, "reward_std": 0.47704753279685974, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14052370190620422, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 1004.513427734375, "completions/mean_terminated_length": 811.2750854492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6757231900271695, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12159456367258138, "kl": 0.027984619140625, "learning_rate": 3.5922361291657243e-07, "loss": 0.054, "num_tokens": 1754702022.0, "reward": 2.349888563156128, "reward_std": 0.36837661266326904, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 931.1964721679688, "completions/mean_terminated_length": 761.8097534179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6759362846944755, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12209497820257305, "kl": 0.030975341796875, "learning_rate": 3.589204726296317e-07, "loss": 0.0491, "num_tokens": 1755187726.0, "reward": 2.4676339626312256, "reward_std": 0.43713292479515076, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13661938905715942, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 924.8281860351562, "completions/mean_terminated_length": 796.3059692382812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6761493793617814, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14193845854589848, "kl": 0.03076171875, "learning_rate": 3.5861743809058296e-07, "loss": 0.104, "num_tokens": 1755668721.0, "reward": 2.5558037757873535, "reward_std": 0.4540950357913971, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119430512189865, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1044.6317138671875, "completions/mean_terminated_length": 829.8184204101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6763624740290874, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12957226237998298, "kl": 0.02691650390625, "learning_rate": 3.5831450946713373e-07, "loss": 0.0948, "num_tokens": 1756207036.0, "reward": 2.3627233505249023, "reward_std": 0.44290974736213684, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.155877023935318, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 969.5469360351562, "completions/mean_terminated_length": 773.205810546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6765755686963933, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11111187066789892, "kl": 0.02490234375, "learning_rate": 3.5801168692693085e-07, "loss": 0.0276, "num_tokens": 1756714913.0, "reward": 2.474888563156128, "reward_std": 0.4251917004585266, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.1125321090221405, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1076.325927734375, "completions/mean_terminated_length": 756.2789306640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6767886633636994, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12143813758747042, "kl": 0.023345947265625, "learning_rate": 3.57708970637564e-07, "loss": 0.0548, "num_tokens": 1757270451.0, "reward": 2.2036831378936768, "reward_std": 0.39231953024864197, "rewards/accuracy_reward/mean": 0.2790178656578064, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1506175547838211, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 941.2701416015625, "completions/mean_terminated_length": 736.320068359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6770017580310053, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1363581152598531, "kl": 0.030059814453125, "learning_rate": 3.574063607665633e-07, "loss": 0.1031, "num_tokens": 1757755084.0, "reward": 2.5167412757873535, "reward_std": 0.522887647151947, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17114688456058502, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 909.9420166015625, "completions/mean_terminated_length": 766.9698486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6772148526983113, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12932242145500158, "kl": 0.03070068359375, "learning_rate": 3.5710385748140006e-07, "loss": 0.1044, "num_tokens": 1758233634.0, "reward": 2.572544813156128, "reward_std": 0.4777713119983673, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137671947479, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1717585176229477, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 989.3995971679688, "completions/mean_terminated_length": 809.7415161132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6774279473656172, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12157252632002181, "kl": 0.02471923828125, "learning_rate": 3.568014609494867e-07, "loss": 0.0454, "num_tokens": 1758748789.0, "reward": 2.4073662757873535, "reward_std": 0.4419124126434326, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.14308229088783264, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 901.154052734375, "completions/mean_terminated_length": 727.2108154296875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6776410420329231, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.131107924792799, "kl": 0.031494140625, "learning_rate": 3.5649917133817653e-07, "loss": 0.0688, "num_tokens": 1759219274.0, "reward": 2.4419643878936768, "reward_std": 0.4218176603317261, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 965.7120971679688, "completions/mean_terminated_length": 755.0266723632812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6778541367002291, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13279159014723266, "kl": 0.025054931640625, "learning_rate": 3.56196988814764e-07, "loss": 0.065, "num_tokens": 1759720745.0, "reward": 2.3839287757873535, "reward_std": 0.38550058007240295, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10779669135808945, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 958.7410888671875, "completions/mean_terminated_length": 780.4986572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.678067231367535, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12068206797649661, "kl": 0.028717041015625, "learning_rate": 3.558949135464837e-07, "loss": 0.099, "num_tokens": 1760216677.0, "reward": 2.3565850257873535, "reward_std": 0.41873735189437866, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1091761365532875, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1044.07373046875, "completions/mean_terminated_length": 808.9945068359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.678280326034841, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13654134308314955, "kl": 0.02484130859375, "learning_rate": 3.5559294570051135e-07, "loss": 0.0614, "num_tokens": 1760757078.0, "reward": 2.3113839626312256, "reward_std": 0.3459184765815735, "rewards/accuracy_reward/mean": 0.4027777910232544, "rewards/accuracy_reward/std": 0.4910254180431366, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11632467061281204, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 875.8482666015625, "completions/mean_terminated_length": 738.4638671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6784934207021469, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.3916316059081557, "kl": 0.0340576171875, "learning_rate": 3.55291085443963e-07, "loss": 0.0822, "num_tokens": 1761227714.0, "reward": 2.509486675262451, "reward_std": 0.3954867720603943, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1477556824684143, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 986.4910888671875, "completions/mean_terminated_length": 741.5274658203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6787065153694529, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.18603528859447718, "kl": 0.02691650390625, "learning_rate": 3.549893329438951e-07, "loss": 0.0892, "num_tokens": 1761742926.0, "reward": 2.3856027126312256, "reward_std": 0.39618343114852905, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11338312178850174, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1019.46435546875, "completions/mean_terminated_length": 789.0272827148438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6789196100367588, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13418443469941863, "kl": 0.02874755859375, "learning_rate": 3.5468768836730465e-07, "loss": 0.0638, "num_tokens": 1762261694.0, "reward": 2.37109375, "reward_std": 0.3599708378314972, "rewards/accuracy_reward/mean": 0.4722222089767456, "rewards/accuracy_reward/std": 0.49980661273002625, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12338031083345413, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 989.40185546875, "completions/mean_terminated_length": 806.5026245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6791327047040647, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13494690325357142, "kl": 0.026580810546875, "learning_rate": 3.543861518811286e-07, "loss": 0.0688, "num_tokens": 1762775586.0, "reward": 2.4497768878936768, "reward_std": 0.45355990529060364, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709373772144318, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 951.2567138671875, "completions/mean_terminated_length": 727.1908569335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6793457993713707, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12377140203959655, "kl": 0.026611328125, "learning_rate": 3.5408472365224474e-07, "loss": 0.0851, "num_tokens": 1763270677.0, "reward": 2.3934152126312256, "reward_std": 0.36327415704727173, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12892211973667145, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 827.3080444335938, "completions/mean_terminated_length": 701.029541015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6795588940386766, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12787719298349856, "kl": 0.03350830078125, "learning_rate": 3.5378340384747027e-07, "loss": 0.0395, "num_tokens": 1763711391.0, "reward": 2.65234375, "reward_std": 0.3463893234729767, "rewards/accuracy_reward/mean": 0.7209821343421936, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10092891752719879, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 925.9397583007812, "completions/mean_terminated_length": 749.0775146484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6797719887059827, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.146702027068851, "kl": 0.030303955078125, "learning_rate": 3.534821926335627e-07, "loss": 0.1066, "num_tokens": 1764191220.0, "reward": 2.5050225257873535, "reward_std": 0.41286221146583557, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15423759818077087, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 971.38623046875, "completions/mean_terminated_length": 772.01318359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6799850833732886, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14519756827273508, "kl": 0.027008056640625, "learning_rate": 3.5318109017721933e-07, "loss": 0.0547, "num_tokens": 1764696881.0, "reward": 2.5145089626312256, "reward_std": 0.3931633532047272, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1312885582447052, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 885.2969360351562, "completions/mean_terminated_length": 708.9486083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6801981780405946, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.22048546292659177, "kl": 0.063262939453125, "learning_rate": 3.528800966450771e-07, "loss": 0.1137, "num_tokens": 1765163686.0, "reward": 2.509486675262451, "reward_std": 0.37478265166282654, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09911917895078659, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1027.805908203125, "completions/mean_terminated_length": 812.7378540039062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6804112727079005, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12321496379054767, "kl": 0.0252685546875, "learning_rate": 3.5257921220371365e-07, "loss": 0.0804, "num_tokens": 1765692143.0, "reward": 2.443638563156128, "reward_std": 0.39287397265434265, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780035495758057, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 987.1116333007812, "completions/mean_terminated_length": 763.4649047851562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6806243673752065, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12586450994243412, "kl": 0.02716064453125, "learning_rate": 3.522784370196444e-07, "loss": 0.0677, "num_tokens": 1766204465.0, "reward": 2.4732143878936768, "reward_std": 0.390165776014328, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383605539798737, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 950.8951416015625, "completions/mean_terminated_length": 781.2396850585938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6808374620425124, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13326212776539978, "kl": 0.0296630859375, "learning_rate": 3.5197777125932636e-07, "loss": 0.0837, "num_tokens": 1766694898.0, "reward": 2.4771206378936768, "reward_std": 0.3968465030193329, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15578390657901764, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1104.6920166015625, "completions/mean_terminated_length": 847.4261474609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6810505567098183, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11076158700780805, "kl": 0.023529052734375, "learning_rate": 3.516772150891545e-07, "loss": 0.1215, "num_tokens": 1767261656.0, "reward": 2.342076063156128, "reward_std": 0.5290426015853882, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16933098435401917, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1049.5067138671875, "completions/mean_terminated_length": 861.4615478515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6812636513771243, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15210716919171763, "kl": 0.02923583984375, "learning_rate": 3.513767686754638e-07, "loss": 0.0655, "num_tokens": 1767798539.0, "reward": 2.4107143878936768, "reward_std": 0.4396968483924866, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1040.5201416015625, "completions/mean_terminated_length": 728.26025390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6814767460444302, "frac_reward_zero_std": 0.25, "grad_norm": 0.12789386381105736, "kl": 0.025634765625, "learning_rate": 3.510764321845283e-07, "loss": 0.0746, "num_tokens": 1768332628.0, "reward": 2.3392858505249023, "reward_std": 0.3575405776500702, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 927.6317138671875, "completions/mean_terminated_length": 730.611572265625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6816898407117362, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13260031680391898, "kl": 0.02862548828125, "learning_rate": 3.5077620578256116e-07, "loss": 0.0483, "num_tokens": 1768820783.0, "reward": 2.423549175262451, "reward_std": 0.43929746747016907, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.17819663882255554, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1029.430908203125, "completions/mean_terminated_length": 827.895751953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6819029353790421, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11829040646104894, "kl": 0.0252685546875, "learning_rate": 3.5047608963571517e-07, "loss": 0.0304, "num_tokens": 1769352944.0, "reward": 2.4051339626312256, "reward_std": 0.35791248083114624, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11345604062080383, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1051.4732666015625, "completions/mean_terminated_length": 834.8369750976562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6821160300463481, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11946690477040675, "kl": 0.0252685546875, "learning_rate": 3.501760839100809e-07, "loss": 0.0345, "num_tokens": 1769893252.0, "reward": 2.4034600257873535, "reward_std": 0.4048386812210083, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1037.009033203125, "completions/mean_terminated_length": 827.1806030273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.682329124713654, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.126176189986573, "kl": 0.0262451171875, "learning_rate": 3.498761887716892e-07, "loss": 0.0803, "num_tokens": 1770426136.0, "reward": 2.3582589626312256, "reward_std": 0.4287581741809845, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.15854518115520477, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 937.2031860351562, "completions/mean_terminated_length": 765.4303588867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.68254221938096, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12766465159262708, "kl": 0.02813720703125, "learning_rate": 3.495764043865088e-07, "loss": 0.0819, "num_tokens": 1770915731.0, "reward": 2.4933037757873535, "reward_std": 0.3616020083427429, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.1090860590338707, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1004.8147583007812, "completions/mean_terminated_length": 814.8944702148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.682755314048266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12618881296051956, "kl": 0.026397705078125, "learning_rate": 3.4927673092044753e-07, "loss": 0.0692, "num_tokens": 1771436288.0, "reward": 2.575892925262451, "reward_std": 0.4124738276004791, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11611521244049072, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 902.49560546875, "completions/mean_terminated_length": 728.7557983398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6829684087155719, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12493146058500157, "kl": 0.03143310546875, "learning_rate": 3.4897716853935166e-07, "loss": 0.0256, "num_tokens": 1771913310.0, "reward": 2.626674175262451, "reward_std": 0.37074634432792664, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45739173889160156, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1293378323316574, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 968.1741333007812, "completions/mean_terminated_length": 807.5846557617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6831815033828779, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12342937569767037, "kl": 0.026214599609375, "learning_rate": 3.4867771740900574e-07, "loss": 0.0733, "num_tokens": 1772417052.0, "reward": 2.5396206378936768, "reward_std": 0.41366440057754517, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10744724422693253, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 919.6160888671875, "completions/mean_terminated_length": 761.6997680664062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6833945980501838, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12483192138286489, "kl": 0.030303955078125, "learning_rate": 3.4837837769513356e-07, "loss": 0.0759, "num_tokens": 1772897728.0, "reward": 2.4921875, "reward_std": 0.4152313768863678, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14495466649532318, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 899.44873046875, "completions/mean_terminated_length": 755.1582641601562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6836076927174898, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12199326708025983, "kl": 0.0306396484375, "learning_rate": 3.4807914956339667e-07, "loss": 0.1006, "num_tokens": 1773370281.0, "reward": 2.5167412757873535, "reward_std": 0.45422816276550293, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1361067146062851, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1012.6719360351562, "completions/mean_terminated_length": 801.1531982421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6838207873847957, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.21942022406132486, "kl": 0.02984619140625, "learning_rate": 3.477800331793944e-07, "loss": 0.0623, "num_tokens": 1773901334.0, "reward": 2.4324777126312256, "reward_std": 0.4031684994697571, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1276179552078247, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 898.8147583007812, "completions/mean_terminated_length": 767.31591796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6840338820521017, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12885378471931858, "kl": 0.032135009765625, "learning_rate": 3.4748102870866536e-07, "loss": 0.0898, "num_tokens": 1774365251.0, "reward": 2.529576063156128, "reward_std": 0.4196571707725525, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681798309087753, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1020.7500610351562, "completions/mean_terminated_length": 807.5471801757812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6842469767194076, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1366695324226509, "kl": 0.029052734375, "learning_rate": 3.471821363166854e-07, "loss": 0.0811, "num_tokens": 1774890339.0, "reward": 2.446986675262451, "reward_std": 0.3788634240627289, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.1770157665014267, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1157.578125, "completions/mean_terminated_length": 885.0000610351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6844600713867135, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15311054874881494, "kl": 0.02508544921875, "learning_rate": 3.4688335616886866e-07, "loss": 0.0879, "num_tokens": 1775488022.0, "reward": 2.3275671005249023, "reward_std": 0.4805065393447876, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1640053689479828, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 812.6451416015625, "completions/mean_terminated_length": 707.9540405273438, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6846731660540195, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14260398135626523, "kl": 0.032989501953125, "learning_rate": 3.465846884305668e-07, "loss": 0.0861, "num_tokens": 1775916663.0, "reward": 2.5708706378936768, "reward_std": 0.45875486731529236, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45011183619499207, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.14974473416805267, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1144.140625, "completions/mean_terminated_length": 867.4490356445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6848862607213254, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12313410358365717, "kl": 0.024871826171875, "learning_rate": 3.462861332670699e-07, "loss": 0.0492, "num_tokens": 1776500470.0, "reward": 2.416294813156128, "reward_std": 0.4591118395328522, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 942.4420166015625, "completions/mean_terminated_length": 794.1012573242188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6850993553886314, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11684252435939976, "kl": 0.026336669921875, "learning_rate": 3.4598769084360535e-07, "loss": 0.0344, "num_tokens": 1776993468.0, "reward": 2.4168527126312256, "reward_std": 0.3923453688621521, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12359261512756348, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 842.3795166015625, "completions/mean_terminated_length": 717.6600952148438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6853124500559373, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14330222678984342, "kl": 0.033660888671875, "learning_rate": 3.456893613253381e-07, "loss": 0.0646, "num_tokens": 1777442342.0, "reward": 2.4810268878936768, "reward_std": 0.47261562943458557, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15391045808792114, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1125.04248046875, "completions/mean_terminated_length": 869.9800415039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6855255447232433, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11719815097562283, "kl": 0.023406982421875, "learning_rate": 3.453911448773707e-07, "loss": 0.0733, "num_tokens": 1778027737.0, "reward": 2.23828125, "reward_std": 0.4467274248600006, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.4732317626476288, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1037.3482666015625, "completions/mean_terminated_length": 779.7311401367188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6857386393905492, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.10334794752639039, "kl": 0.025543212890625, "learning_rate": 3.450930416647429e-07, "loss": 0.0421, "num_tokens": 1778558501.0, "reward": 2.404017925262451, "reward_std": 0.4223562777042389, "rewards/accuracy_reward/mean": 0.5324074029922485, "rewards/accuracy_reward/std": 0.49952712655067444, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1457662582397461, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1056.2366943359375, "completions/mean_terminated_length": 824.0054931640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6859517340578553, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1322004363509126, "kl": 0.02593994140625, "learning_rate": 3.447950518524327e-07, "loss": 0.0785, "num_tokens": 1779095471.0, "reward": 2.39453125, "reward_std": 0.46494632959365845, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1089.634033203125, "completions/mean_terminated_length": 841.96630859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6861648287251612, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12935176748098937, "kl": 0.027313232421875, "learning_rate": 3.44497175605354e-07, "loss": 0.06, "num_tokens": 1779650267.0, "reward": 2.25, "reward_std": 0.4300120174884796, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1523328423500061, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1063.055908203125, "completions/mean_terminated_length": 842.3851928710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6863779233924671, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12782868625535562, "kl": 0.0262451171875, "learning_rate": 3.441994130883584e-07, "loss": 0.091, "num_tokens": 1780195444.0, "reward": 2.3214287757873535, "reward_std": 0.47149014472961426, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15850186347961426, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 857.0313110351562, "completions/mean_terminated_length": 697.2304077148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6865910180597731, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1432209289558248, "kl": 0.03350830078125, "learning_rate": 3.439017644662353e-07, "loss": 0.0625, "num_tokens": 1780647202.0, "reward": 2.587611675262451, "reward_std": 0.35203656554222107, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.09263478219509125, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1007.3147583007812, "completions/mean_terminated_length": 804.7279663085938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.686804112727079, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12949679308296447, "kl": 0.028076171875, "learning_rate": 3.4360422990371006e-07, "loss": 0.0646, "num_tokens": 1781170271.0, "reward": 2.4575893878936768, "reward_std": 0.4232456386089325, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 987.6652221679688, "completions/mean_terminated_length": 797.9210815429688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.687017207394385, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1200548861840904, "kl": 0.02935791015625, "learning_rate": 3.4330680956544535e-07, "loss": 0.049, "num_tokens": 1781687433.0, "reward": 2.4481027126312256, "reward_std": 0.4604281783103943, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13776934146881104, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1062.3348388671875, "completions/mean_terminated_length": 882.8865966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6872303020616909, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12295632693412772, "kl": 0.026611328125, "learning_rate": 3.4300950361604023e-07, "loss": 0.0862, "num_tokens": 1782231391.0, "reward": 2.453125, "reward_std": 0.37440693378448486, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12717820703983307, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 922.1964721679688, "completions/mean_terminated_length": 727.6858520507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6874433967289969, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14504560021625246, "kl": 0.029144287109375, "learning_rate": 3.4271231222003137e-07, "loss": 0.0742, "num_tokens": 1782718631.0, "reward": 2.396763563156128, "reward_std": 0.419214129447937, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13978439569473267, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1099.1473388671875, "completions/mean_terminated_length": 843.7903442382812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6876564913963028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12388396314078683, "kl": 0.025848388671875, "learning_rate": 3.424152355418913e-07, "loss": 0.0875, "num_tokens": 1783277209.0, "reward": 2.506138563156128, "reward_std": 0.49026402831077576, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 942.4553833007812, "completions/mean_terminated_length": 771.4948120117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6878695860636087, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1342325794874321, "kl": 0.02734375, "learning_rate": 3.4211827374602875e-07, "loss": 0.0628, "num_tokens": 1783765813.0, "reward": 2.533482313156128, "reward_std": 0.4199574589729309, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12198749929666519, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 991.90185546875, "completions/mean_terminated_length": 772.7115478515625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6880826807309147, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12903442656014882, "kl": 0.027740478515625, "learning_rate": 3.4182142699678987e-07, "loss": 0.0831, "num_tokens": 1784279401.0, "reward": 2.478794813156128, "reward_std": 0.3849352300167084, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585100531578064, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 930.3839721679688, "completions/mean_terminated_length": 780.42529296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6882957753982206, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11987131534352367, "kl": 0.0283203125, "learning_rate": 3.4152469545845646e-07, "loss": 0.0859, "num_tokens": 1784767653.0, "reward": 2.529576063156128, "reward_std": 0.35763558745384216, "rewards/accuracy_reward/mean": 0.6342592835426331, "rewards/accuracy_reward/std": 0.482195645570755, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09374666959047318, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1047.3973388671875, "completions/mean_terminated_length": 862.1005249023438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6885088700655266, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11484671446180356, "kl": 0.02593994140625, "learning_rate": 3.412280792952467e-07, "loss": 0.0905, "num_tokens": 1785301079.0, "reward": 2.368861675262451, "reward_std": 0.4609025716781616, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14149756729602814, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1138.4241943359375, "completions/mean_terminated_length": 863.43603515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6887219647328325, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 1.9967743485306602, "kl": 0.045989990234375, "learning_rate": 3.4093157867131483e-07, "loss": 0.091, "num_tokens": 1785886837.0, "reward": 2.2801339626312256, "reward_std": 0.4846292734146118, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9408482313156128, "rewards/tag_count_reward/std": 0.20512165129184723, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1032.57373046875, "completions/mean_terminated_length": 828.3994750976562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6889350594001385, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13882914908989316, "kl": 0.02685546875, "learning_rate": 3.4063519375075123e-07, "loss": 0.1173, "num_tokens": 1786413446.0, "reward": 2.5385046005249023, "reward_std": 0.43064308166503906, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561823636293411, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1056.1116943359375, "completions/mean_terminated_length": 803.27734375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.6891481540674445, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12581406935550796, "kl": 0.026611328125, "learning_rate": 3.4033892469758256e-07, "loss": 0.0941, "num_tokens": 1786958120.0, "reward": 2.467076063156128, "reward_std": 0.5067455172538757, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16741061210632324, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1054.4085693359375, "completions/mean_terminated_length": 831.800537109375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.6893612487347505, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1266715869994, "kl": 0.026123046875, "learning_rate": 3.400427716757709e-07, "loss": 0.0994, "num_tokens": 1787496735.0, "reward": 2.3989956378936768, "reward_std": 0.5269179344177246, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.18444819748401642, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 888.77685546875, "completions/mean_terminated_length": 746.416015625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6895743434020564, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13999207710826805, "kl": 0.031097412109375, "learning_rate": 3.3974673484921424e-07, "loss": 0.0757, "num_tokens": 1787963323.0, "reward": 2.4994421005249023, "reward_std": 0.39913371205329895, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14177079498767853, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 988.77685546875, "completions/mean_terminated_length": 840.5394287109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6897874380693623, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1296151895488594, "kl": 0.02618408203125, "learning_rate": 3.394508143817464e-07, "loss": 0.0935, "num_tokens": 1788478071.0, "reward": 2.4252233505249023, "reward_std": 0.38937118649482727, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12956565618515015, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 959.5223388671875, "completions/mean_terminated_length": 774.7937622070312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6900005327366683, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12981929883126386, "kl": 0.02911376953125, "learning_rate": 3.3915501043713653e-07, "loss": 0.0661, "num_tokens": 1788982961.0, "reward": 2.548549175262451, "reward_std": 0.4418761134147644, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.13086353242397308, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 956.4063110351562, "completions/mean_terminated_length": 764.4461669921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6902136274039742, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1337861017809418, "kl": 0.02935791015625, "learning_rate": 3.3885932317908954e-07, "loss": 0.0421, "num_tokens": 1789476887.0, "reward": 2.484375, "reward_std": 0.4053283929824829, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 903.388427734375, "completions/mean_terminated_length": 756.3475952148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6904267220712802, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12671813006467553, "kl": 0.02691650390625, "learning_rate": 3.3856375277124567e-07, "loss": 0.0251, "num_tokens": 1789955605.0, "reward": 2.540736675262451, "reward_std": 0.4351884722709656, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11358113586902618, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 960.2522583007812, "completions/mean_terminated_length": 775.6475219726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6906398167385861, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1382367425193716, "kl": 0.028228759765625, "learning_rate": 3.382682993771807e-07, "loss": 0.09, "num_tokens": 1790459990.0, "reward": 2.419642925262451, "reward_std": 0.4065806567668915, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12842853367328644, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 968.9397583007812, "completions/mean_terminated_length": 798.8553466796875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.6908529114058921, "frac_reward_zero_std": 0.0, "grad_norm": 0.14050551075254888, "kl": 0.028839111328125, "learning_rate": 3.3797296316040533e-07, "loss": 0.1015, "num_tokens": 1790969083.0, "reward": 2.5089287757873535, "reward_std": 0.46475881338119507, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12457982450723648, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1029.7991943359375, "completions/mean_terminated_length": 794.8297119140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.691066006073198, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10930864713073257, "kl": 0.0264892578125, "learning_rate": 3.376777442843656e-07, "loss": 0.0658, "num_tokens": 1791499089.0, "reward": 2.431919813156128, "reward_std": 0.4017297029495239, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.1088741347193718, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1032.54248046875, "completions/mean_terminated_length": 811.790771484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6912791007405039, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1319419259679159, "kl": 0.024505615234375, "learning_rate": 3.373826429124425e-07, "loss": 0.0952, "num_tokens": 1792034740.0, "reward": 2.3761162757873535, "reward_std": 0.42495864629745483, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.146973118185997, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1028.7388916015625, "completions/mean_terminated_length": 800.3797607421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6914921954078099, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13124120719865734, "kl": 0.027130126953125, "learning_rate": 3.370876592079519e-07, "loss": 0.0933, "num_tokens": 1792566735.0, "reward": 2.4676339626312256, "reward_std": 0.3682340383529663, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 975.482177734375, "completions/mean_terminated_length": 752.8840942382812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6917052900751158, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1307720831033583, "kl": 0.0272216796875, "learning_rate": 3.367927933341453e-07, "loss": 0.0733, "num_tokens": 1793072087.0, "reward": 2.478236675262451, "reward_std": 0.36143651604652405, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523807287216, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1032.837158203125, "completions/mean_terminated_length": 831.9759521484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6919183847424218, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13823756019022623, "kl": 0.024658203125, "learning_rate": 3.3649804545420747e-07, "loss": 0.0801, "num_tokens": 1793604430.0, "reward": 2.3950893878936768, "reward_std": 0.5093016624450684, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15850186347961426, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1013.0826416015625, "completions/mean_terminated_length": 827.8869018554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6921314794097277, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12939242129823833, "kl": 0.02642822265625, "learning_rate": 3.3620341573125954e-07, "loss": 0.0818, "num_tokens": 1794131747.0, "reward": 2.3543527126312256, "reward_std": 0.49827271699905396, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9369419813156128, "rewards/tag_count_reward/std": 0.20292110741138458, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 972.919677734375, "completions/mean_terminated_length": 777.192626953125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6923445740770338, "frac_reward_zero_std": 0.0, "grad_norm": 0.13638528758844093, "kl": 0.029876708984375, "learning_rate": 3.359089043283563e-07, "loss": 0.1124, "num_tokens": 1794637071.0, "reward": 2.466517925262451, "reward_std": 0.44474413990974426, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13220298290252686, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 763.6205444335938, "completions/mean_terminated_length": 637.7009887695312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6925576687443397, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15979077152984286, "kl": 0.0335693359375, "learning_rate": 3.3561451140848723e-07, "loss": 0.073, "num_tokens": 1795040165.0, "reward": 2.5379464626312256, "reward_std": 0.39149630069732666, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1008.9598388671875, "completions/mean_terminated_length": 779.6348876953125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6927707634116457, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13844661475026718, "kl": 0.027191162109375, "learning_rate": 3.3532023713457636e-07, "loss": 0.0708, "num_tokens": 1795568115.0, "reward": 2.279017925262451, "reward_std": 0.404011607170105, "rewards/accuracy_reward/mean": 0.40509259700775146, "rewards/accuracy_reward/std": 0.49147912859916687, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15182389318943024, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1026.97998046875, "completions/mean_terminated_length": 791.3599243164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6929838580789516, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13752420654232853, "kl": 0.02484130859375, "learning_rate": 3.350260816694816e-07, "loss": 0.1021, "num_tokens": 1796103690.0, "reward": 2.368861675262451, "reward_std": 0.439092218875885, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14874933660030365, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 965.0402221679688, "completions/mean_terminated_length": 729.6141357421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6931969527462575, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13367002497597577, "kl": 0.029052734375, "learning_rate": 3.347320451759962e-07, "loss": 0.0791, "num_tokens": 1796602748.0, "reward": 2.4771206378936768, "reward_std": 0.3418791890144348, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11898136883974075, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1026.055908203125, "completions/mean_terminated_length": 779.7700805664062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6934100474135635, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13465163305440037, "kl": 0.026123046875, "learning_rate": 3.3443812781684613e-07, "loss": 0.065, "num_tokens": 1797137861.0, "reward": 2.3895089626312256, "reward_std": 0.5091385841369629, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17114688456058502, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1013.6295166015625, "completions/mean_terminated_length": 795.572998046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6936231420808694, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14682133717412618, "kl": 0.0302734375, "learning_rate": 3.341443297546925e-07, "loss": 0.0997, "num_tokens": 1797655999.0, "reward": 2.4754464626312256, "reward_std": 0.4947318434715271, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.1693466603755951, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 970.2745971679688, "completions/mean_terminated_length": 797.1683959960938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6938362367481754, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14476095030009653, "kl": 0.027984619140625, "learning_rate": 3.338506511521301e-07, "loss": 0.105, "num_tokens": 1798165946.0, "reward": 2.3521206378936768, "reward_std": 0.45912423729896545, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 918.482177734375, "completions/mean_terminated_length": 723.329833984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6940493314154813, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12726409027995148, "kl": 0.029693603515625, "learning_rate": 3.335570921716875e-07, "loss": 0.0655, "num_tokens": 1798638306.0, "reward": 2.513951063156128, "reward_std": 0.4008297324180603, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561823636293411, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 825.7902221679688, "completions/mean_terminated_length": 679.125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6942624260827873, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15922719171485278, "kl": 0.030517578125, "learning_rate": 3.33263652975827e-07, "loss": 0.078, "num_tokens": 1799080180.0, "reward": 2.431919813156128, "reward_std": 0.3773408830165863, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11987641453742981, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1050.40185546875, "completions/mean_terminated_length": 774.7122192382812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6944755207500932, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12576416285989886, "kl": 0.026702880859375, "learning_rate": 3.3297033372694473e-07, "loss": 0.1206, "num_tokens": 1799625912.0, "reward": 2.3470983505249023, "reward_std": 0.45343929529190063, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15964375436306, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1016.7902221679688, "completions/mean_terminated_length": 825.8253784179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6946886154173991, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12008035074118424, "kl": 0.025604248046875, "learning_rate": 3.326771345873706e-07, "loss": 0.0842, "num_tokens": 1800148218.0, "reward": 2.364955425262451, "reward_std": 0.4011920690536499, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14979995787143707, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1040.180908203125, "completions/mean_terminated_length": 800.754150390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6949017100847051, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12962964994846102, "kl": 0.028900146484375, "learning_rate": 3.323840557193681e-07, "loss": 0.094, "num_tokens": 1800681083.0, "reward": 2.33984375, "reward_std": 0.526114284992218, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764793753623962, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 964.810302734375, "completions/mean_terminated_length": 736.462158203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.695114804752011, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13680874924568992, "kl": 0.02960205078125, "learning_rate": 3.320910972851333e-07, "loss": 0.125, "num_tokens": 1801186694.0, "reward": 2.3470983505249023, "reward_std": 0.4431781768798828, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13763901591300964, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1009.7656860351562, "completions/mean_terminated_length": 839.8727416992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.695327899419317, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13551004048073556, "kl": 0.02764892578125, "learning_rate": 3.3179825944679683e-07, "loss": 0.1295, "num_tokens": 1801710861.0, "reward": 2.404576063156128, "reward_std": 0.499032586812973, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1518804132938385, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1018.6897583007812, "completions/mean_terminated_length": 805.0592651367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.695540994086623, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1283599951794352, "kl": 0.02813720703125, "learning_rate": 3.315055423664217e-07, "loss": 0.0614, "num_tokens": 1802238402.0, "reward": 2.4090402126312256, "reward_std": 0.4992900490760803, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1369149088859558, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 928.1116333007812, "completions/mean_terminated_length": 761.5641479492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.695754088753929, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1314950921602472, "kl": 0.0323486328125, "learning_rate": 3.312129462060048e-07, "loss": 0.0047, "num_tokens": 1802721044.0, "reward": 2.4603796005249023, "reward_std": 0.35351112484931946, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09374666959047318, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1018.9710083007812, "completions/mean_terminated_length": 778.0137939453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6959671834212349, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11803846573079549, "kl": 0.02752685546875, "learning_rate": 3.3092047112747514e-07, "loss": 0.0638, "num_tokens": 1803248375.0, "reward": 2.443638563156128, "reward_std": 0.38853222131729126, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11970315873622894, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 936.8482666015625, "completions/mean_terminated_length": 778.1122436523438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6961802780885409, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1489322828575107, "kl": 0.028350830078125, "learning_rate": 3.306281172926959e-07, "loss": 0.0931, "num_tokens": 1803735795.0, "reward": 2.509486675262451, "reward_std": 0.4343414306640625, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681798309087753, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 929.4219360351562, "completions/mean_terminated_length": 753.1085205078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6963933727558468, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14209620506468532, "kl": 0.031402587890625, "learning_rate": 3.303358848634621e-07, "loss": 0.1164, "num_tokens": 1804218496.0, "reward": 2.4620537757873535, "reward_std": 0.4689207375049591, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 877.419677734375, "completions/mean_terminated_length": 706.7723999023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6966064674231527, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13174252037452464, "kl": 0.02960205078125, "learning_rate": 3.300437740015022e-07, "loss": 0.1134, "num_tokens": 1804688028.0, "reward": 2.62890625, "reward_std": 0.40082108974456787, "rewards/accuracy_reward/mean": 0.7053571343421936, "rewards/accuracy_reward/std": 0.45639166235923767, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10744724422693253, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 876.2656860351562, "completions/mean_terminated_length": 702.0076904296875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6968195620904587, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13423189162957397, "kl": 0.030487060546875, "learning_rate": 3.2975178486847724e-07, "loss": 0.0876, "num_tokens": 1805160755.0, "reward": 2.4693081378936768, "reward_std": 0.4009111225605011, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13124457001686096, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1074.04248046875, "completions/mean_terminated_length": 804.8860473632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6970326567577646, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14131694558300043, "kl": 0.026397705078125, "learning_rate": 3.2945991762598054e-07, "loss": 0.0731, "num_tokens": 1805707270.0, "reward": 2.23828125, "reward_std": 0.4340845048427582, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.1539783775806427, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 944.7410888671875, "completions/mean_terminated_length": 764.207763671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6972457514250706, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.22698029651585092, "kl": 0.031524658203125, "learning_rate": 3.2916817243553885e-07, "loss": 0.084, "num_tokens": 1806208434.0, "reward": 2.4464287757873535, "reward_std": 0.5017050504684448, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1353386640548706, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 912.4219360351562, "completions/mean_terminated_length": 719.6997680664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6974588460923765, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13584003178857615, "kl": 0.030487060546875, "learning_rate": 3.288765494586104e-07, "loss": 0.0535, "num_tokens": 1806679279.0, "reward": 2.579799175262451, "reward_std": 0.3927724063396454, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1034.571533203125, "completions/mean_terminated_length": 783.3314819335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6976719407596825, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11238609365077733, "kl": 0.02520751953125, "learning_rate": 3.285850488565861e-07, "loss": 0.0366, "num_tokens": 1807212863.0, "reward": 2.3660714626312256, "reward_std": 0.3291527032852173, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.11074430495500565, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 989.779052734375, "completions/mean_terminated_length": 773.5833129882812, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6978850354269884, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13342683138492548, "kl": 0.026092529296875, "learning_rate": 3.282936707907895e-07, "loss": 0.0752, "num_tokens": 1807724876.0, "reward": 2.421875, "reward_std": 0.39623740315437317, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1007.904052734375, "completions/mean_terminated_length": 785.2276611328125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6980981300942944, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12141763965678348, "kl": 0.02813720703125, "learning_rate": 3.28002415422476e-07, "loss": 0.0985, "num_tokens": 1808245169.0, "reward": 2.3526787757873535, "reward_std": 0.436323344707489, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13106490671634674, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1065.654052734375, "completions/mean_terminated_length": 811.789306640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6983112247616003, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.5026496786030623, "kl": 0.026611328125, "learning_rate": 3.27711282912833e-07, "loss": 0.0591, "num_tokens": 1808797814.0, "reward": 2.2901787757873535, "reward_std": 0.4232753813266754, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16370916366577148, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 915.794677734375, "completions/mean_terminated_length": 727.09375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6985243194289062, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12542396628115388, "kl": 0.026763916015625, "learning_rate": 3.2742027342298013e-07, "loss": 0.0094, "num_tokens": 1809282330.0, "reward": 2.4129464626312256, "reward_std": 0.3796856999397278, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9910714030265808, "rewards/tag_count_reward/std": 0.06192811205983162, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 893.7656860351562, "completions/mean_terminated_length": 795.9491577148438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.6987374140962123, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1308948560282255, "kl": 0.030181884765625, "learning_rate": 3.271293871139689e-07, "loss": 0.0513, "num_tokens": 1809749585.0, "reward": 2.541294813156128, "reward_std": 0.4330494701862335, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709372282028198, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 864.2567138671875, "completions/mean_terminated_length": 751.3814697265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6989505087635182, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13624695986172078, "kl": 0.030609130859375, "learning_rate": 3.26838624146783e-07, "loss": 0.0662, "num_tokens": 1810206516.0, "reward": 2.428013563156128, "reward_std": 0.33159682154655457, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10481233894824982, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 968.5558471679688, "completions/mean_terminated_length": 804.83544921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6991636034308242, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1228726696466669, "kl": 0.029266357421875, "learning_rate": 3.2654798468233656e-07, "loss": 0.0358, "num_tokens": 1810710893.0, "reward": 2.3856027126312256, "reward_std": 0.42100322246551514, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.0940391793847084, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 979.19873046875, "completions/mean_terminated_length": 810.7312622070312, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.6993766980981301, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12560129284198732, "kl": 0.02667236328125, "learning_rate": 3.2625746888147705e-07, "loss": 0.0942, "num_tokens": 1811218230.0, "reward": 2.5518975257873535, "reward_std": 0.37651193141937256, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13276717066764832, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 930.0335083007812, "completions/mean_terminated_length": 736.876953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6995897927654361, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1511039599780612, "kl": 0.027862548828125, "learning_rate": 3.259670769049824e-07, "loss": 0.1434, "num_tokens": 1811704821.0, "reward": 2.4425225257873535, "reward_std": 0.35751911997795105, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396106719971, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10230488330125809, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1108.274658203125, "completions/mean_terminated_length": 858.742919921875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.699802887432742, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11136553704779555, "kl": 0.026123046875, "learning_rate": 3.256768089135625e-07, "loss": 0.0336, "num_tokens": 1812272016.0, "reward": 2.3309152126312256, "reward_std": 0.4128180146217346, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14590215682983398, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 997.58935546875, "completions/mean_terminated_length": 786.3807373046875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7000159821000479, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.125393409447218, "kl": 0.027252197265625, "learning_rate": 3.253866650678584e-07, "loss": 0.085, "num_tokens": 1812796712.0, "reward": 2.4168527126312256, "reward_std": 0.40719953179359436, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1276179552078247, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 988.4308471679688, "completions/mean_terminated_length": 782.16796875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7002290767673539, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13794475752739013, "kl": 0.02886962890625, "learning_rate": 3.250966455284423e-07, "loss": 0.0681, "num_tokens": 1813309481.0, "reward": 2.4966518878936768, "reward_std": 0.4522949755191803, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709372282028198, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 859.6719360351562, "completions/mean_terminated_length": 665.2181396484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.7004421714346598, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16778808787969596, "kl": 0.031646728515625, "learning_rate": 3.248067504558182e-07, "loss": 0.1045, "num_tokens": 1813765862.0, "reward": 2.5078125, "reward_std": 0.445243239402771, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.146973118185997, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 996.716552734375, "completions/mean_terminated_length": 795.4068603515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7006552661019658, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12289475780202898, "kl": 0.026153564453125, "learning_rate": 3.2451698001042073e-07, "loss": 0.0582, "num_tokens": 1814285383.0, "reward": 2.3878350257873535, "reward_std": 0.40453317761421204, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005797147750854, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13566918671131134, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 994.5357666015625, "completions/mean_terminated_length": 837.86669921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7008683607692717, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1354577387269475, "kl": 0.027740478515625, "learning_rate": 3.242273343526154e-07, "loss": 0.0689, "num_tokens": 1814798615.0, "reward": 2.5262277126312256, "reward_std": 0.4395878314971924, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780036240816116, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1063.0335693359375, "completions/mean_terminated_length": 776.3428955078125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7010814554365777, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1240309569726836, "kl": 0.0245361328125, "learning_rate": 3.23937813642699e-07, "loss": 0.0648, "num_tokens": 1815344070.0, "reward": 2.4620537757873535, "reward_std": 0.47612667083740234, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16768723726272583, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 923.63623046875, "completions/mean_terminated_length": 769.5355224609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7012945501038836, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13770741847895607, "kl": 0.028961181640625, "learning_rate": 3.2364841804089956e-07, "loss": 0.0717, "num_tokens": 1815825459.0, "reward": 2.5396206378936768, "reward_std": 0.4038306176662445, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11170817166566849, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 996.51123046875, "completions/mean_terminated_length": 855.42529296875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7015076447711897, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13289246472798402, "kl": 0.02838134765625, "learning_rate": 3.233591477073747e-07, "loss": 0.1051, "num_tokens": 1816340360.0, "reward": 2.439732313156128, "reward_std": 0.4351404905319214, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13271193206310272, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 898.919677734375, "completions/mean_terminated_length": 738.1068725585938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7017207394384956, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14747922268824798, "kl": 0.032470703125, "learning_rate": 3.2307000280221363e-07, "loss": 0.0857, "num_tokens": 1816819812.0, "reward": 2.5496652126312256, "reward_std": 0.4561227262020111, "rewards/accuracy_reward/mean": 0.6785714030265808, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151519536972046, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 991.3058471679688, "completions/mean_terminated_length": 775.4220581054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7019338341058015, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12777596855568987, "kl": 0.02947998046875, "learning_rate": 3.227809834854361e-07, "loss": 0.0807, "num_tokens": 1817328749.0, "reward": 2.4453125, "reward_std": 0.4548998475074768, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.1865052878856659, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 962.5245971679688, "completions/mean_terminated_length": 726.5516357421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7021469287731075, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1384345481062676, "kl": 0.0284423828125, "learning_rate": 3.224920899169922e-07, "loss": 0.0823, "num_tokens": 1817822792.0, "reward": 2.4838171005249023, "reward_std": 0.46545371413230896, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15912973880767822, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1045.509033203125, "completions/mean_terminated_length": 789.9719848632812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7023600234404134, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 1.0905837870370134, "kl": 0.075958251953125, "learning_rate": 3.2220332225676215e-07, "loss": 0.1118, "num_tokens": 1818366172.0, "reward": 2.4291296005249023, "reward_std": 0.4681529402732849, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16403579711914062, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 976.0670166015625, "completions/mean_terminated_length": 787.5643310546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7025731181077194, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12743050237934564, "kl": 0.027984619140625, "learning_rate": 3.2191468066455694e-07, "loss": 0.075, "num_tokens": 1818866426.0, "reward": 2.6729912757873535, "reward_std": 0.43493738770484924, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42408111691474915, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14409084618091583, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1101.30810546875, "completions/mean_terminated_length": 853.3013916015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7027862127750253, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14079531550953045, "kl": 0.027069091796875, "learning_rate": 3.216261653001174e-07, "loss": 0.0873, "num_tokens": 1819431508.0, "reward": 2.3900671005249023, "reward_std": 0.4381040036678314, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1645980179309845, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1002.4285888671875, "completions/mean_terminated_length": 808.80419921875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7029993074423313, "frac_reward_zero_std": 0.0, "grad_norm": 0.13705097501633545, "kl": 0.029296875, "learning_rate": 3.213377763231151e-07, "loss": 0.0711, "num_tokens": 1819946788.0, "reward": 2.318638563156128, "reward_std": 0.5260837078094482, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15933358669281006, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1003.5357666015625, "completions/mean_terminated_length": 779.9241333007812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7032124021096372, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1254106013563999, "kl": 0.02789306640625, "learning_rate": 3.2104951389315073e-07, "loss": 0.0933, "num_tokens": 1820466884.0, "reward": 2.5072546005249023, "reward_std": 0.437931627035141, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12450841069221497, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 990.77685546875, "completions/mean_terminated_length": 774.7849731445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7034254967769431, "frac_reward_zero_std": 0.25, "grad_norm": 0.12925601519596436, "kl": 0.02734375, "learning_rate": 3.2076137816975593e-07, "loss": 0.0673, "num_tokens": 1820978832.0, "reward": 2.3900671005249023, "reward_std": 0.3944319486618042, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.19573107361793518, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1114.0513916015625, "completions/mean_terminated_length": 882.5153198242188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7036385914442491, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.16011009139431864, "kl": 0.025848388671875, "learning_rate": 3.204733693123916e-07, "loss": 0.0795, "num_tokens": 1821545799.0, "reward": 2.3348214626312256, "reward_std": 0.552046000957489, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.18524597585201263, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1186.859375, "completions/mean_terminated_length": 846.1588745117188, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.703851686111555, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12750023586832318, "kl": 0.022796630859375, "learning_rate": 3.201854874804485e-07, "loss": 0.083, "num_tokens": 1822148200.0, "reward": 2.303013563156128, "reward_std": 0.46007663011550903, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.8794642686843872, "rewards/format_reward/std": 0.3259509205818176, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1571800261735916, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 1055.1004638671875, "completions/mean_terminated_length": 794.9887084960938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.704064780778861, "frac_reward_zero_std": 0.0, "grad_norm": 0.14221797129167296, "kl": 0.026275634765625, "learning_rate": 3.1989773283324736e-07, "loss": 0.0912, "num_tokens": 1822687653.0, "reward": 2.3041296005249023, "reward_std": 0.47645753622055054, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.8839285969734192, "rewards/format_reward/std": 0.32066863775253296, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.15283909440040588, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 875.9330444335938, "completions/mean_terminated_length": 751.4913940429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7042778754461669, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13598281697186068, "kl": 0.030303955078125, "learning_rate": 3.1961010553003806e-07, "loss": 0.0685, "num_tokens": 1823148423.0, "reward": 2.490513563156128, "reward_std": 0.3628806173801422, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690935015678406, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.1057254895567894, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 955.8438110351562, "completions/mean_terminated_length": 777.1272583007812, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.704490970113473, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.118936527033922, "kl": 0.028839111328125, "learning_rate": 3.193226057300007e-07, "loss": 0.056, "num_tokens": 1823647841.0, "reward": 2.533482313156128, "reward_std": 0.36338889598846436, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09024729579687119, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1107.0223388671875, "completions/mean_terminated_length": 826.0927734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7047040647807788, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12489503230430565, "kl": 0.02337646484375, "learning_rate": 3.1903523359224416e-07, "loss": 0.1135, "num_tokens": 1824217003.0, "reward": 2.2840402126312256, "reward_std": 0.49713265895843506, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16202186048030853, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 959.7120971679688, "completions/mean_terminated_length": 744.3823852539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7049171594480849, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13976667539604745, "kl": 0.0284423828125, "learning_rate": 3.1874798927580703e-07, "loss": 0.0885, "num_tokens": 1824716506.0, "reward": 2.43359375, "reward_std": 0.44176995754241943, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13700605928897858, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1196.7523193359375, "completions/mean_terminated_length": 892.3666381835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7051302541153908, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.24957462123262, "kl": 0.065155029296875, "learning_rate": 3.1846087293965705e-07, "loss": 0.068, "num_tokens": 1825324011.0, "reward": 2.3130581378936768, "reward_std": 0.44048577547073364, "rewards/accuracy_reward/mean": 0.43287035822868347, "rewards/accuracy_reward/std": 0.4960475564002991, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.12858274579048157, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1117.540283203125, "completions/mean_terminated_length": 863.7784423828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7053433487826967, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12080196882019427, "kl": 0.023529052734375, "learning_rate": 3.1817388474269104e-07, "loss": 0.0969, "num_tokens": 1825899037.0, "reward": 2.4051339626312256, "reward_std": 0.4640944302082062, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13503853976726532, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1146.555908203125, "completions/mean_terminated_length": 880.8121337890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7055564434500027, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12318427952721635, "kl": 0.023834228515625, "learning_rate": 3.17887024843735e-07, "loss": 0.0895, "num_tokens": 1826481734.0, "reward": 2.3158483505249023, "reward_std": 0.40195152163505554, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13814601302146912, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1032.0692138671875, "completions/mean_terminated_length": 758.6600341796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7057695381173086, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13737603757207636, "kl": 0.028106689453125, "learning_rate": 3.1760029340154395e-07, "loss": 0.0925, "num_tokens": 1827014261.0, "reward": 2.3515625, "reward_std": 0.40651756525039673, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17337898910045624, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1134.9398193359375, "completions/mean_terminated_length": 851.9444580078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7059826327846146, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13012824840905396, "kl": 0.02520751953125, "learning_rate": 3.173136905748018e-07, "loss": 0.0709, "num_tokens": 1827596650.0, "reward": 2.3392858505249023, "reward_std": 0.43294811248779297, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14480386674404144, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 961.7567138671875, "completions/mean_terminated_length": 753.7526245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7061957274519205, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14694851496720768, "kl": 0.028076171875, "learning_rate": 3.1702721652212163e-07, "loss": 0.1154, "num_tokens": 1828098445.0, "reward": 2.3370537757873535, "reward_std": 0.4137912094593048, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1554640233516693, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1006.388427734375, "completions/mean_terminated_length": 810.2227783203125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.7064088221192265, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13507016009995815, "kl": 0.027099609375, "learning_rate": 3.167408714020442e-07, "loss": 0.1006, "num_tokens": 1828614683.0, "reward": 2.390625, "reward_std": 0.440521240234375, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1129.3795166015625, "completions/mean_terminated_length": 858.572265625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7066219167865324, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13477461935518678, "kl": 0.023773193359375, "learning_rate": 3.164546553730401e-07, "loss": 0.0955, "num_tokens": 1829185861.0, "reward": 2.4051339626312256, "reward_std": 0.4446280598640442, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1566121131181717, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 950.9129638671875, "completions/mean_terminated_length": 777.9871215820312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7068350114538383, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1375163035637281, "kl": 0.031097412109375, "learning_rate": 3.161685685935077e-07, "loss": 0.0591, "num_tokens": 1829679198.0, "reward": 2.5011162757873535, "reward_std": 0.3659578561782837, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1194591298699379, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1156.57373046875, "completions/mean_terminated_length": 866.4645385742188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7070481061211443, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12488575807549228, "kl": 0.024200439453125, "learning_rate": 3.158826112217747e-07, "loss": 0.0518, "num_tokens": 1830272735.0, "reward": 2.3621652126312256, "reward_std": 0.4514215588569641, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.12074156850576401, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1069.384033203125, "completions/mean_terminated_length": 826.7743530273438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7072612007884502, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11253983843588625, "kl": 0.025604248046875, "learning_rate": 3.1559678341609585e-07, "loss": 0.0824, "num_tokens": 1830816971.0, "reward": 2.4458706378936768, "reward_std": 0.4225183427333832, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1247187927365303, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 984.4732666015625, "completions/mean_terminated_length": 797.4487915039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7074742954557562, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12582896554687586, "kl": 0.025787353515625, "learning_rate": 3.1531108533465557e-07, "loss": 0.0502, "num_tokens": 1831336399.0, "reward": 2.3214287757873535, "reward_std": 0.495141863822937, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1567276567220688, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1029.321533203125, "completions/mean_terminated_length": 801.0928955078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7076873901230621, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12302125028672911, "kl": 0.026580810546875, "learning_rate": 3.150255171355656e-07, "loss": 0.0587, "num_tokens": 1831873087.0, "reward": 2.4034600257873535, "reward_std": 0.47603335976600647, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12381463497877121, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1064.8348388671875, "completions/mean_terminated_length": 789.548583984375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7079004847903682, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12348766134258168, "kl": 0.023284912109375, "learning_rate": 3.1474007897686615e-07, "loss": 0.0515, "num_tokens": 1832427493.0, "reward": 2.4268975257873535, "reward_std": 0.40408816933631897, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681799054145813, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 888.7254638671875, "completions/mean_terminated_length": 709.4561767578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.7081135794576741, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15826759025383902, "kl": 0.032196044921875, "learning_rate": 3.144547710165254e-07, "loss": 0.1183, "num_tokens": 1832894810.0, "reward": 2.509486675262451, "reward_std": 0.40651994943618774, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11395422369241714, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1017.6563110351562, "completions/mean_terminated_length": 776.3911743164062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7083266741249801, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10936487406565053, "kl": 0.02777099609375, "learning_rate": 3.141695934124392e-07, "loss": 0.0915, "num_tokens": 1833419456.0, "reward": 2.54296875, "reward_std": 0.4157126545906067, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1328611671924591, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 972.57373046875, "completions/mean_terminated_length": 773.4205932617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.708539768792286, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13087561067421552, "kl": 0.0283203125, "learning_rate": 3.1388454632243217e-07, "loss": 0.0559, "num_tokens": 1833923233.0, "reward": 2.3934152126312256, "reward_std": 0.4466537833213806, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1280086487531662, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 967.43310546875, "completions/mean_terminated_length": 767.3280029296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7087528634595919, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.21556822016114494, "kl": 0.028533935546875, "learning_rate": 3.1359962990425546e-07, "loss": 0.0913, "num_tokens": 1834433155.0, "reward": 2.4536831378936768, "reward_std": 0.4137965440750122, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.146371990442276, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1018.3951416015625, "completions/mean_terminated_length": 811.3699951171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7089659581268979, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12600631662270473, "kl": 0.027984619140625, "learning_rate": 3.1331484431558853e-07, "loss": 0.0762, "num_tokens": 1834964116.0, "reward": 2.4481027126312256, "reward_std": 0.46774420142173767, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1084.8638916015625, "completions/mean_terminated_length": 829.1158447265625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7091790527942038, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13520939512821195, "kl": 0.02410888671875, "learning_rate": 3.130301897140387e-07, "loss": 0.0938, "num_tokens": 1835523687.0, "reward": 2.3861608505249023, "reward_std": 0.43371132016181946, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12935835123062134, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1055.9085693359375, "completions/mean_terminated_length": 803.0223999023438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7093921474615098, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10952494960144282, "kl": 0.023529052734375, "learning_rate": 3.127456662571405e-07, "loss": 0.0324, "num_tokens": 1836064350.0, "reward": 2.385044813156128, "reward_std": 0.34014779329299927, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10442600399255753, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1059.747802734375, "completions/mean_terminated_length": 838.3360595703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7096052421288157, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12125403815746828, "kl": 0.026153564453125, "learning_rate": 3.1246127410235556e-07, "loss": 0.058, "num_tokens": 1836606973.0, "reward": 2.4012277126312256, "reward_std": 0.4782784581184387, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1013.5469360351562, "completions/mean_terminated_length": 792.07861328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7098183367961217, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1420265675907378, "kl": 0.0286865234375, "learning_rate": 3.1217701340707334e-07, "loss": 0.0812, "num_tokens": 1837125378.0, "reward": 2.454799175262451, "reward_std": 0.3740387260913849, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.14072787761688232, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 967.1495971679688, "completions/mean_terminated_length": 717.7225341796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7100314314634276, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12889704686773856, "kl": 0.029541015625, "learning_rate": 3.1189288432861056e-07, "loss": 0.1189, "num_tokens": 1837623157.0, "reward": 2.4793527126312256, "reward_std": 0.49332547187805176, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12381463497877121, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1026.1763916015625, "completions/mean_terminated_length": 797.2431640625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7102445261307336, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13855725035464447, "kl": 0.02685546875, "learning_rate": 3.1160888702421086e-07, "loss": 0.0884, "num_tokens": 1838151540.0, "reward": 2.4654018878936768, "reward_std": 0.39011961221694946, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12975822389125824, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1190.8326416015625, "completions/mean_terminated_length": 918.555908203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7104576207980395, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10397632559255922, "kl": 0.0224609375, "learning_rate": 3.1132502165104457e-07, "loss": 0.0566, "num_tokens": 1838754009.0, "reward": 2.357701063156128, "reward_std": 0.4094166159629822, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 931.6964721679688, "completions/mean_terminated_length": 735.3910522460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7106707154653454, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.12226213723928774, "kl": 0.02728271484375, "learning_rate": 3.1104128836621e-07, "loss": 0.0777, "num_tokens": 1839246289.0, "reward": 2.33984375, "reward_std": 0.3604331314563751, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 959.7388916015625, "completions/mean_terminated_length": 816.8358764648438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7108838101326515, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1390103376987811, "kl": 0.029144287109375, "learning_rate": 3.1075768732673156e-07, "loss": 0.1006, "num_tokens": 1839742092.0, "reward": 2.587611675262451, "reward_std": 0.43270811438560486, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.46923142671585083, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1049.9754638671875, "completions/mean_terminated_length": 799.0753784179688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7110969047999574, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12199422006775361, "kl": 0.024749755859375, "learning_rate": 3.104742186895608e-07, "loss": 0.0704, "num_tokens": 1840279649.0, "reward": 2.431919813156128, "reward_std": 0.361887663602829, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493200302124, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11055814474821091, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1100.21875, "completions/mean_terminated_length": 795.4749145507812, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7113099994672634, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11430266827547936, "kl": 0.02532958984375, "learning_rate": 3.101908826115758e-07, "loss": 0.029, "num_tokens": 1840840419.0, "reward": 2.3828125, "reward_std": 0.3795602023601532, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14601868391036987, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 916.65185546875, "completions/mean_terminated_length": 741.7009887695312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7115230941345693, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14603279244430828, "kl": 0.03070068359375, "learning_rate": 3.0990767924958133e-07, "loss": 0.0949, "num_tokens": 1841321335.0, "reward": 2.5345983505249023, "reward_std": 0.4367714822292328, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 984.154052734375, "completions/mean_terminated_length": 752.8831787109375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7117361888018753, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.142662817238777, "kl": 0.028106689453125, "learning_rate": 3.0962460876030903e-07, "loss": 0.0845, "num_tokens": 1841834252.0, "reward": 2.5340402126312256, "reward_std": 0.44301551580429077, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12062777578830719, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 975.6272583007812, "completions/mean_terminated_length": 790.3482055664062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7119492834691812, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13259397625033023, "kl": 0.02679443359375, "learning_rate": 3.0934167130041666e-07, "loss": 0.1063, "num_tokens": 1842345829.0, "reward": 2.4927456378936768, "reward_std": 0.47583070397377014, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1377149522304535, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1031.607177734375, "completions/mean_terminated_length": 779.63232421875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7121623781364871, "frac_reward_zero_std": 0.0, "grad_norm": 0.11652767259003653, "kl": 0.025238037109375, "learning_rate": 3.090588670264883e-07, "loss": 0.0551, "num_tokens": 1842872933.0, "reward": 2.4575893878936768, "reward_std": 0.45661118626594543, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12672585248947144, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 895.1428833007812, "completions/mean_terminated_length": 720.2879028320312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7123754728037931, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13516968570754018, "kl": 0.030181884765625, "learning_rate": 3.087761960950345e-07, "loss": 0.0084, "num_tokens": 1843335365.0, "reward": 2.4425225257873535, "reward_std": 0.34387677907943726, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11395422369241714, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1017.4219360351562, "completions/mean_terminated_length": 826.5740356445312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.712588567471099, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11950758731982991, "kl": 0.0264892578125, "learning_rate": 3.0849365866249233e-07, "loss": 0.0719, "num_tokens": 1843861746.0, "reward": 2.5284600257873535, "reward_std": 0.3759201169013977, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09830980002880096, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 959.2210083007812, "completions/mean_terminated_length": 800.4987182617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.712801662138405, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1235019831248256, "kl": 0.030181884765625, "learning_rate": 3.0821125488522426e-07, "loss": 0.0744, "num_tokens": 1844355301.0, "reward": 2.345982313156128, "reward_std": 0.3601638376712799, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12951265275478363, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 999.8638916015625, "completions/mean_terminated_length": 754.4324951171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.7130147568057109, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14013754790379926, "kl": 0.0279541015625, "learning_rate": 3.07928984919519e-07, "loss": 0.0855, "num_tokens": 1844863704.0, "reward": 2.4799108505249023, "reward_std": 0.36846476793289185, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14040927588939667, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1002.450927734375, "completions/mean_terminated_length": 754.060791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7132278514730169, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.16233568333649878, "kl": 0.027191162109375, "learning_rate": 3.076468489215919e-07, "loss": 0.0865, "num_tokens": 1845382674.0, "reward": 2.3861608505249023, "reward_std": 0.40563100576400757, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13318143784999847, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 973.5313110351562, "completions/mean_terminated_length": 804.1705932617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7134409461403228, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13222379850792534, "kl": 0.026824951171875, "learning_rate": 3.0736484704758327e-07, "loss": 0.0479, "num_tokens": 1845883760.0, "reward": 2.4676339626312256, "reward_std": 0.4564989507198334, "rewards/accuracy_reward/mean": 0.5925925970077515, "rewards/accuracy_reward/std": 0.49192148447036743, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559476852417, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1020.9308471679688, "completions/mean_terminated_length": 849.7526245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7136540408076288, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13173485116039826, "kl": 0.0260009765625, "learning_rate": 3.0708297945355975e-07, "loss": 0.0802, "num_tokens": 1846419873.0, "reward": 2.4402902126312256, "reward_std": 0.4642750322818756, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12848563492298126, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1059.185302734375, "completions/mean_terminated_length": 767.6849365234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7138671354749347, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1325594760609868, "kl": 0.0272216796875, "learning_rate": 3.068012462955133e-07, "loss": 0.0741, "num_tokens": 1846964692.0, "reward": 2.412388563156128, "reward_std": 0.48228099942207336, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15266746282577515, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1051.884033203125, "completions/mean_terminated_length": 811.8226928710938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7140802301422406, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1326909796334596, "kl": 0.026336669921875, "learning_rate": 3.065196477293616e-07, "loss": 0.0685, "num_tokens": 1847508160.0, "reward": 2.3671875, "reward_std": 0.5420317053794861, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3310886323451996, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16511768102645874, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 977.0089721679688, "completions/mean_terminated_length": 785.35791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7142933248095467, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1250874401117488, "kl": 0.029693603515625, "learning_rate": 3.0623818391094846e-07, "loss": 0.0604, "num_tokens": 1848013604.0, "reward": 2.427455425262451, "reward_std": 0.4641398787498474, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16796617209911346, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1017.8527221679688, "completions/mean_terminated_length": 769.5900268554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7145064194768526, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13307198861320274, "kl": 0.028167724609375, "learning_rate": 3.0595685499604176e-07, "loss": 0.0541, "num_tokens": 1848530898.0, "reward": 2.4095983505249023, "reward_std": 0.3591146767139435, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11709482222795486, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 986.9397583007812, "completions/mean_terminated_length": 780.3866577148438, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7147195141441586, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1343722109909295, "kl": 0.027099609375, "learning_rate": 3.0567566114033605e-07, "loss": 0.0875, "num_tokens": 1849042599.0, "reward": 2.4363839626312256, "reward_std": 0.4808584153652191, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16906259953975677, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 990.3817138671875, "completions/mean_terminated_length": 770.8759765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7149326088114645, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11637771505213464, "kl": 0.0250244140625, "learning_rate": 3.053946024994505e-07, "loss": 0.0784, "num_tokens": 1849551602.0, "reward": 2.4720983505249023, "reward_std": 0.4218716323375702, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1070.21875, "completions/mean_terminated_length": 800.0056762695312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7151457034787705, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1181530438472655, "kl": 0.0264892578125, "learning_rate": 3.0511367922892957e-07, "loss": 0.0712, "num_tokens": 1850101636.0, "reward": 2.3331475257873535, "reward_std": 0.525721549987793, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16657331585884094, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1026.08935546875, "completions/mean_terminated_length": 776.2888793945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7153587981460764, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12344795420859954, "kl": 0.025238037109375, "learning_rate": 3.048328914842426e-07, "loss": 0.0457, "num_tokens": 1850641276.0, "reward": 2.353794813156128, "reward_std": 0.4471887946128845, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005797147750854, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1728886514902115, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1081.3951416015625, "completions/mean_terminated_length": 864.8333129882812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7155718928133823, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14333011356686176, "kl": 0.027801513671875, "learning_rate": 3.0455223942078424e-07, "loss": 0.0532, "num_tokens": 1851200621.0, "reward": 2.4464287757873535, "reward_std": 0.40338632464408875, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1041.513427734375, "completions/mean_terminated_length": 822.7119750976562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7157849874806883, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10878182717875556, "kl": 0.024688720703125, "learning_rate": 3.0427172319387405e-07, "loss": 0.0347, "num_tokens": 1851737107.0, "reward": 2.4838171005249023, "reward_std": 0.4349047541618347, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15174883604049683, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1005.8125610351562, "completions/mean_terminated_length": 782.6883544921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7159980821479942, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13538172255775882, "kl": 0.028472900390625, "learning_rate": 3.0399134295875607e-07, "loss": 0.0596, "num_tokens": 1852259695.0, "reward": 2.3738839626312256, "reward_std": 0.42971986532211304, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.131288543343544, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 948.263427734375, "completions/mean_terminated_length": 797.5380249023438, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7162111768153002, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12632024327275226, "kl": 0.0279541015625, "learning_rate": 3.0371109887059954e-07, "loss": 0.0777, "num_tokens": 1852752245.0, "reward": 2.6065850257873535, "reward_std": 0.4649077355861664, "rewards/accuracy_reward/mean": 0.7232142686843872, "rewards/accuracy_reward/std": 0.44790980219841003, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14557664096355438, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1013.1183471679688, "completions/mean_terminated_length": 801.6908569335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7164242714826061, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1258924157932152, "kl": 0.02508544921875, "learning_rate": 3.034309910844979e-07, "loss": 0.0759, "num_tokens": 1853277898.0, "reward": 2.3900671005249023, "reward_std": 0.45095714926719666, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14936909079551697, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1102.404052734375, "completions/mean_terminated_length": 884.1895751953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7166373661499121, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13232439371994847, "kl": 0.0255126953125, "learning_rate": 3.0315101975546953e-07, "loss": 0.0885, "num_tokens": 1853850143.0, "reward": 2.330357313156128, "reward_std": 0.5085436701774597, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.1634649783372879, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 956.6563110351562, "completions/mean_terminated_length": 787.8917236328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.716850460817218, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12088820622748457, "kl": 0.02862548828125, "learning_rate": 3.02871185038457e-07, "loss": 0.0591, "num_tokens": 1854348933.0, "reward": 2.4408483505249023, "reward_std": 0.4369667172431946, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709372282028198, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 957.4866333007812, "completions/mean_terminated_length": 748.6648559570312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.717063555484524, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1509645566878022, "kl": 0.02838134765625, "learning_rate": 3.0259148708832717e-07, "loss": 0.1007, "num_tokens": 1854843807.0, "reward": 2.4871652126312256, "reward_std": 0.4313191771507263, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4846842288970947, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1654377579689026, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 854.2879638671875, "completions/mean_terminated_length": 669.6932983398438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.71727665015183, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13238180384958176, "kl": 0.031219482421875, "learning_rate": 3.023119260598721e-07, "loss": 0.0717, "num_tokens": 1855297440.0, "reward": 2.5474331378936768, "reward_std": 0.35852569341659546, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11682131141424179, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 915.9285888671875, "completions/mean_terminated_length": 706.2857055664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7174897448191359, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1480645840062063, "kl": 0.031982421875, "learning_rate": 3.020325021078069e-07, "loss": 0.12, "num_tokens": 1855772320.0, "reward": 2.5306921005249023, "reward_std": 0.4434750974178314, "rewards/accuracy_reward/mean": 0.6574074029922485, "rewards/accuracy_reward/std": 0.4751267731189728, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15343420207500458, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1013.6451416015625, "completions/mean_terminated_length": 774.9478149414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7177028394864419, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13590651085789632, "kl": 0.027191162109375, "learning_rate": 3.017532153867716e-07, "loss": 0.1053, "num_tokens": 1856299233.0, "reward": 2.419642925262451, "reward_std": 0.4332006573677063, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14714714884757996, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 964.8527221679688, "completions/mean_terminated_length": 767.6570434570312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7179159341537478, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11508799739620248, "kl": 0.02679443359375, "learning_rate": 3.014740660513298e-07, "loss": 0.0679, "num_tokens": 1856791231.0, "reward": 2.3582589626312256, "reward_std": 0.3488439917564392, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15391045808792114, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 952.0803833007812, "completions/mean_terminated_length": 814.4019775390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7181290288210538, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13211528006286527, "kl": 0.0284423828125, "learning_rate": 3.0119505425596926e-07, "loss": 0.04, "num_tokens": 1857290067.0, "reward": 2.5161831378936768, "reward_std": 0.39218929409980774, "rewards/accuracy_reward/mean": 0.6180555820465088, "rewards/accuracy_reward/std": 0.48642635345458984, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.09263478219509125, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1046.046875, "completions/mean_terminated_length": 790.6470947265625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7183421234883597, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13010871659163667, "kl": 0.025665283203125, "learning_rate": 3.009161801551022e-07, "loss": 0.0769, "num_tokens": 1857825448.0, "reward": 2.3872768878936768, "reward_std": 0.48215585947036743, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1464626044034958, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 871.2924194335938, "completions/mean_terminated_length": 710.0177612304688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7185552181556657, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.148438393209212, "kl": 0.0308837890625, "learning_rate": 3.006374439030633e-07, "loss": 0.0979, "num_tokens": 1858279643.0, "reward": 2.533482313156128, "reward_std": 0.4409452974796295, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15048591792583466, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 986.7813110351562, "completions/mean_terminated_length": 793.577880859375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7187683128229716, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12587387667349265, "kl": 0.027984619140625, "learning_rate": 3.003588456541123e-07, "loss": 0.0864, "num_tokens": 1858787449.0, "reward": 2.4598214626312256, "reward_std": 0.41850295662879944, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13526484370231628, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1104.102783203125, "completions/mean_terminated_length": 750.864990234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.7189814074902775, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12169853656544591, "kl": 0.027557373046875, "learning_rate": 3.000803855624318e-07, "loss": 0.0349, "num_tokens": 1859352151.0, "reward": 2.2533483505249023, "reward_std": 0.4910773038864136, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18940123915672302, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 794.0670166015625, "completions/mean_terminated_length": 643.594970703125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7191945021575835, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1574005011889106, "kl": 0.03558349609375, "learning_rate": 2.9980206378212824e-07, "loss": 0.0756, "num_tokens": 1859777893.0, "reward": 2.6356027126312256, "reward_std": 0.42525947093963623, "rewards/accuracy_reward/mean": 0.7321428656578064, "rewards/accuracy_reward/std": 0.4433377981185913, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.15223345160484314, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 928.15185546875, "completions/mean_terminated_length": 731.2230834960938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7194075968248894, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1450262779853937, "kl": 0.03265380859375, "learning_rate": 2.9952388046723133e-07, "loss": 0.1113, "num_tokens": 1860260185.0, "reward": 2.568080425262451, "reward_std": 0.40151655673980713, "rewards/accuracy_reward/mean": 0.6808035969734192, "rewards/accuracy_reward/std": 0.4666863977909088, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14601868391036987, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 999.7835083007812, "completions/mean_terminated_length": 818.6780395507812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7196206914921954, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.128861760934775, "kl": 0.0272216796875, "learning_rate": 2.9924583577169404e-07, "loss": 0.0517, "num_tokens": 1860780040.0, "reward": 2.4620537757873535, "reward_std": 0.3908250629901886, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967316269874573, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 963.0067138671875, "completions/mean_terminated_length": 772.2073364257812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7198337861595013, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14201917937765637, "kl": 0.02947998046875, "learning_rate": 2.9896792984939346e-07, "loss": 0.1067, "num_tokens": 1861276075.0, "reward": 2.4642858505249023, "reward_std": 0.4351259171962738, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16231536865234375, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1013.5692138671875, "completions/mean_terminated_length": 805.57373046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.7200468808268073, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13663790131044107, "kl": 0.028900146484375, "learning_rate": 2.9869016285412853e-07, "loss": 0.0597, "num_tokens": 1861798042.0, "reward": 2.4754464626312256, "reward_std": 0.47384026646614075, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12040118128061295, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1054.388427734375, "completions/mean_terminated_length": 848.1671142578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7202599754941132, "frac_reward_zero_std": 0.0, "grad_norm": 0.1348566846109887, "kl": 0.0283203125, "learning_rate": 2.9841253493962235e-07, "loss": 0.0794, "num_tokens": 1862341896.0, "reward": 2.470424175262451, "reward_std": 0.4618629813194275, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416089117527008, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1022.7522583007812, "completions/mean_terminated_length": 772.1361083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7204730701614193, "frac_reward_zero_std": 0.0, "grad_norm": 0.3725021170002289, "kl": 0.027496337890625, "learning_rate": 2.981350462595209e-07, "loss": 0.046, "num_tokens": 1862873593.0, "reward": 2.411830425262451, "reward_std": 0.40283289551734924, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 927.5938110351562, "completions/mean_terminated_length": 750.9922485351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7206861648287252, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1437815809805686, "kl": 0.0301513671875, "learning_rate": 2.978576969673926e-07, "loss": 0.1365, "num_tokens": 1863362147.0, "reward": 2.431919813156128, "reward_std": 0.4461188316345215, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9520089030265808, "rewards/tag_count_reward/std": 0.17438411712646484, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 993.4754638671875, "completions/mean_terminated_length": 753.679443359375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7208992594960311, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13574330888021796, "kl": 0.027069091796875, "learning_rate": 2.9758048721672917e-07, "loss": 0.068, "num_tokens": 1863875160.0, "reward": 2.392857313156128, "reward_std": 0.3938256800174713, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 929.0826416015625, "completions/mean_terminated_length": 718.3580932617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7211123541633371, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14170788217378083, "kl": 0.027801513671875, "learning_rate": 2.973034171609449e-07, "loss": 0.1309, "num_tokens": 1864355453.0, "reward": 2.5200893878936768, "reward_std": 0.4428699016571045, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14140157401561737, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1085.997802734375, "completions/mean_terminated_length": 854.1578979492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.721325448830643, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10269392190679648, "kl": 0.025238037109375, "learning_rate": 2.970264869533771e-07, "loss": 0.0354, "num_tokens": 1864915468.0, "reward": 2.3364956378936768, "reward_std": 0.3664836585521698, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.09722442179918289, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 895.560302734375, "completions/mean_terminated_length": 713.9096069335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.721538543497949, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14456626840655323, "kl": 0.03448486328125, "learning_rate": 2.9674969674728546e-07, "loss": 0.0742, "num_tokens": 1865381751.0, "reward": 2.53515625, "reward_std": 0.42612650990486145, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416090607643127, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 926.4866333007812, "completions/mean_terminated_length": 782.4130859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7217516381652549, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12529746810743825, "kl": 0.029388427734375, "learning_rate": 2.964730466958517e-07, "loss": 0.0678, "num_tokens": 1865870545.0, "reward": 2.5027902126312256, "reward_std": 0.37594395875930786, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13485699892044067, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 928.9263916015625, "completions/mean_terminated_length": 678.2048950195312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7219647328325609, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12656152216409805, "kl": 0.0311279296875, "learning_rate": 2.961965369521809e-07, "loss": 0.0661, "num_tokens": 1866350976.0, "reward": 2.46875, "reward_std": 0.3738847076892853, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1039.493408203125, "completions/mean_terminated_length": 764.446044921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7221778274998668, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11924420041602478, "kl": 0.027679443359375, "learning_rate": 2.9592016766929996e-07, "loss": 0.0268, "num_tokens": 1866883437.0, "reward": 2.400111675262451, "reward_std": 0.4034498929977417, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14921021461486816, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 917.7388916015625, "completions/mean_terminated_length": 739.583984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7223909221671727, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1374622423647146, "kl": 0.030792236328125, "learning_rate": 2.95643939000158e-07, "loss": 0.078, "num_tokens": 1867367832.0, "reward": 2.4988839626312256, "reward_std": 0.34756582975387573, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.08887429535388947, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 981.0647583007812, "completions/mean_terminated_length": 727.5939331054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7226040168344787, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1314783628689341, "kl": 0.029571533203125, "learning_rate": 2.953678510976265e-07, "loss": 0.0069, "num_tokens": 1867878853.0, "reward": 2.4347100257873535, "reward_std": 0.3679303824901581, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14323382079601288, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1042.930908203125, "completions/mean_terminated_length": 814.380859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7228171115017846, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.20629654114684143, "kl": 0.0592041015625, "learning_rate": 2.950919041144988e-07, "loss": 0.069, "num_tokens": 1868420742.0, "reward": 2.443638563156128, "reward_std": 0.3858453333377838, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09244592487812042, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 870.9598388671875, "completions/mean_terminated_length": 719.7531127929688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7230302061690906, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15618052955148484, "kl": 0.031829833984375, "learning_rate": 2.9481609820349085e-07, "loss": 0.0788, "num_tokens": 1868881796.0, "reward": 2.4799108505249023, "reward_std": 0.37334492802619934, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11531686037778854, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1067.6317138671875, "completions/mean_terminated_length": 796.7036743164062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7232433008363965, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12719264167274025, "kl": 0.028472900390625, "learning_rate": 2.945404335172399e-07, "loss": 0.0709, "num_tokens": 1869428639.0, "reward": 2.4051339626312256, "reward_std": 0.4426869750022888, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14457522332668304, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 973.0692138671875, "completions/mean_terminated_length": 780.7131958007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7234563955037026, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13510886682063347, "kl": 0.0284423828125, "learning_rate": 2.942649102083051e-07, "loss": 0.0985, "num_tokens": 1869935102.0, "reward": 2.5552456378936768, "reward_std": 0.45971906185150146, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969305515289307, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1475696861743927, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1137.5, "completions/mean_terminated_length": 811.92724609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7236694901710085, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11903697103639525, "kl": 0.02264404296875, "learning_rate": 2.939895284291677e-07, "loss": 0.0573, "num_tokens": 1870516462.0, "reward": 2.3861608505249023, "reward_std": 0.3681625723838806, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09910815209150314, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1004.62060546875, "completions/mean_terminated_length": 781.2412109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7238825848383145, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1373640118790415, "kl": 0.028656005859375, "learning_rate": 2.9371428833223056e-07, "loss": 0.0539, "num_tokens": 1871031524.0, "reward": 2.5200893878936768, "reward_std": 0.43982771039009094, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14816173911094666, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 981.5625610351562, "completions/mean_terminated_length": 756.7459716796875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7240956795056204, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1359291534914912, "kl": 0.02740478515625, "learning_rate": 2.9343919006981763e-07, "loss": 0.0797, "num_tokens": 1871541424.0, "reward": 2.4654018878936768, "reward_std": 0.38752931356430054, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1401556432247162, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 952.2678833007812, "completions/mean_terminated_length": 776.2694091796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7243087741729263, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14431950148324868, "kl": 0.031097412109375, "learning_rate": 2.931642337941749e-07, "loss": 0.098, "num_tokens": 1872036328.0, "reward": 2.46875, "reward_std": 0.42768269777297974, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15220165252685547, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1067.9598388671875, "completions/mean_terminated_length": 814.6910400390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7245218688402323, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13226005065580296, "kl": 0.0242919921875, "learning_rate": 2.928894196574697e-07, "loss": 0.121, "num_tokens": 1872583670.0, "reward": 2.4838171005249023, "reward_std": 0.4587395191192627, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661914110183716, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 835.1763916015625, "completions/mean_terminated_length": 682.8115234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7247349635075382, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12861472321939502, "kl": 0.029632568359375, "learning_rate": 2.9261474781179075e-07, "loss": 0.0461, "num_tokens": 1873029493.0, "reward": 2.493861675262451, "reward_std": 0.38119107484817505, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 895.3839721679688, "completions/mean_terminated_length": 671.0079956054688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7249480581748442, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14006947485055238, "kl": 0.031707763671875, "learning_rate": 2.923402184091476e-07, "loss": 0.0811, "num_tokens": 1873491713.0, "reward": 2.5005581378936768, "reward_std": 0.4125838577747345, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1514935940504074, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1052.493408203125, "completions/mean_terminated_length": 822.760986328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7251611528421501, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12464653839890166, "kl": 0.0294189453125, "learning_rate": 2.9206583160147133e-07, "loss": 0.0461, "num_tokens": 1874028174.0, "reward": 2.310826063156128, "reward_std": 0.4906337857246399, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1611565798521042, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 983.6607666015625, "completions/mean_terminated_length": 789.8892211914062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7253742475094561, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14022095706034168, "kl": 0.030426025390625, "learning_rate": 2.9179158754061405e-07, "loss": 0.0727, "num_tokens": 1874539398.0, "reward": 2.484375, "reward_std": 0.40690091252326965, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13686132431030273, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 898.2366333007812, "completions/mean_terminated_length": 750.5339965820312, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.725587342176762, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12909018496604843, "kl": 0.029815673828125, "learning_rate": 2.9151748637834923e-07, "loss": 0.1061, "num_tokens": 1875005376.0, "reward": 2.6356027126312256, "reward_std": 0.3758071959018707, "rewards/accuracy_reward/mean": 0.7209821343421936, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519908487796783, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1033.477783203125, "completions/mean_terminated_length": 799.357177734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.725800436844068, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11217433270572724, "kl": 0.026519775390625, "learning_rate": 2.9124352826637015e-07, "loss": 0.0372, "num_tokens": 1875540534.0, "reward": 2.4676339626312256, "reward_std": 0.42221924662590027, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12848198413848877, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1137.5648193359375, "completions/mean_terminated_length": 855.383056640625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.7260135315113739, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10470909152618864, "kl": 0.023773193359375, "learning_rate": 2.9096971335629227e-07, "loss": 0.0782, "num_tokens": 1876113763.0, "reward": 2.361049175262451, "reward_std": 0.38826024532318115, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1489589959383011, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1032.3192138671875, "completions/mean_terminated_length": 791.02490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7262266261786798, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.12516592055488948, "kl": 0.02801513671875, "learning_rate": 2.906960417996509e-07, "loss": 0.0507, "num_tokens": 1876648338.0, "reward": 2.4174108505249023, "reward_std": 0.3114190101623535, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1077.091552734375, "completions/mean_terminated_length": 812.29833984375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7264397208459858, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13383448456698407, "kl": 0.024749755859375, "learning_rate": 2.904225137479024e-07, "loss": 0.1346, "num_tokens": 1877194171.0, "reward": 2.349888563156128, "reward_std": 0.4980581998825073, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14275364577770233, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 974.13623046875, "completions/mean_terminated_length": 771.8965454101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7266528155132917, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14558345691938124, "kl": 0.028564453125, "learning_rate": 2.901491293524236e-07, "loss": 0.1216, "num_tokens": 1877703736.0, "reward": 2.470982313156128, "reward_std": 0.45858171582221985, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16402913630008698, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 946.74560546875, "completions/mean_terminated_length": 792.6259765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7268659101805978, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11547998785104664, "kl": 0.029693603515625, "learning_rate": 2.898758887645116e-07, "loss": 0.0622, "num_tokens": 1878199798.0, "reward": 2.4871652126312256, "reward_std": 0.40343886613845825, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1051.477783203125, "completions/mean_terminated_length": 818.1322631835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7270790048479037, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12519129143993266, "kl": 0.026947021484375, "learning_rate": 2.8960279213538463e-07, "loss": 0.097, "num_tokens": 1878739500.0, "reward": 2.4698662757873535, "reward_std": 0.4294077455997467, "rewards/accuracy_reward/mean": 0.5763888955116272, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 957.9710083007812, "completions/mean_terminated_length": 731.738525390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7272920995152097, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14688524122295993, "kl": 0.0283203125, "learning_rate": 2.893298396161805e-07, "loss": 0.113, "num_tokens": 1879241039.0, "reward": 2.3521206378936768, "reward_std": 0.48625805974006653, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17125900089740753, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1065.296875, "completions/mean_terminated_length": 793.7236328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7275051941825156, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12837300590149925, "kl": 0.027923583984375, "learning_rate": 2.8905703135795745e-07, "loss": 0.0906, "num_tokens": 1879791956.0, "reward": 2.256138563156128, "reward_std": 0.4179733097553253, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14801737666130066, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 896.4866333007812, "completions/mean_terminated_length": 704.5677490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7277182888498215, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14254997194340524, "kl": 0.02886962890625, "learning_rate": 2.887843675116941e-07, "loss": 0.1022, "num_tokens": 1880261758.0, "reward": 2.4308037757873535, "reward_std": 0.3681824505329132, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11923423409461975, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1101.41748046875, "completions/mean_terminated_length": 836.374267578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7279313835171275, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11386296398519406, "kl": 0.025726318359375, "learning_rate": 2.88511848228289e-07, "loss": 0.0686, "num_tokens": 1880824409.0, "reward": 2.4190850257873535, "reward_std": 0.422359824180603, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15667887032032013, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 975.5245971679688, "completions/mean_terminated_length": 728.0302124023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7281444781844334, "frac_reward_zero_std": 0.25, "grad_norm": 0.12697691856832877, "kl": 0.030242919921875, "learning_rate": 2.8823947365856064e-07, "loss": 0.0405, "num_tokens": 1881323364.0, "reward": 2.400111675262451, "reward_std": 0.3837801516056061, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151519536972046, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 945.30810546875, "completions/mean_terminated_length": 758.1671142578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7283575728517394, "frac_reward_zero_std": 0.0, "grad_norm": 0.332477326817764, "kl": 0.04132080078125, "learning_rate": 2.879672439532474e-07, "loss": 0.0984, "num_tokens": 1881811918.0, "reward": 2.4095983505249023, "reward_std": 0.501810610294342, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.15942463278770447, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 976.9710083007812, "completions/mean_terminated_length": 771.8803100585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7285706675190453, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12300772630348249, "kl": 0.029052734375, "learning_rate": 2.876951592630079e-07, "loss": 0.0712, "num_tokens": 1882318913.0, "reward": 2.407924175262451, "reward_std": 0.3882731795310974, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14323382079601288, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1013.700927734375, "completions/mean_terminated_length": 818.9124755859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7287837621863513, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12796250893393432, "kl": 0.028717041015625, "learning_rate": 2.874232197384201e-07, "loss": 0.0501, "num_tokens": 1882837739.0, "reward": 2.46875, "reward_std": 0.3944890797138214, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1049.904052734375, "completions/mean_terminated_length": 816.1901245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7289968568536572, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12971459548470377, "kl": 0.0279541015625, "learning_rate": 2.871514255299815e-07, "loss": 0.0406, "num_tokens": 1883380672.0, "reward": 2.36328125, "reward_std": 0.46721577644348145, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16796153783798218, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 975.3594360351562, "completions/mean_terminated_length": 806.286865234375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7292099515209632, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11805064951570657, "kl": 0.02899169921875, "learning_rate": 2.8687977678810965e-07, "loss": 0.0224, "num_tokens": 1883886289.0, "reward": 2.506138563156128, "reward_std": 0.37737151980400085, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1040.8192138671875, "completions/mean_terminated_length": 838.302978515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7294230461882691, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13589890123814963, "kl": 0.026702880859375, "learning_rate": 2.86608273663141e-07, "loss": 0.0741, "num_tokens": 1884425792.0, "reward": 2.3448662757873535, "reward_std": 0.40496307611465454, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13064035773277283, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 926.4263916015625, "completions/mean_terminated_length": 700.9088745117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.729636140855575, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13125399988346317, "kl": 0.03070068359375, "learning_rate": 2.863369163053323e-07, "loss": 0.0688, "num_tokens": 1884915807.0, "reward": 2.4810268878936768, "reward_std": 0.36501941084861755, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.11662477999925613, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 944.6495971679688, "completions/mean_terminated_length": 743.7757568359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7298492355228811, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12733418155021453, "kl": 0.029296875, "learning_rate": 2.860657048648584e-07, "loss": 0.059, "num_tokens": 1885407538.0, "reward": 2.4536831378936768, "reward_std": 0.36568281054496765, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11109180748462677, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 920.37060546875, "completions/mean_terminated_length": 781.8897094726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.730062330190187, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11131642959655688, "kl": 0.03009033203125, "learning_rate": 2.8579463949181455e-07, "loss": 0.0686, "num_tokens": 1885886232.0, "reward": 2.58203125, "reward_std": 0.382823646068573, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13463464379310608, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1110.2723388671875, "completions/mean_terminated_length": 819.631591796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.730275424857493, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12969345318674996, "kl": 0.02398681640625, "learning_rate": 2.855237203362146e-07, "loss": 0.0961, "num_tokens": 1886457618.0, "reward": 2.35546875, "reward_std": 0.44602110981941223, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16145840287208557, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1061.3148193359375, "completions/mean_terminated_length": 802.8309326171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7304885195247989, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1345681550978294, "kl": 0.027679443359375, "learning_rate": 2.8525294754799145e-07, "loss": 0.1109, "num_tokens": 1887007375.0, "reward": 2.4341518878936768, "reward_std": 0.49060893058776855, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15339045226573944, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 891.85498046875, "completions/mean_terminated_length": 706.15283203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7307016141921049, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13442112984228843, "kl": 0.03326416015625, "learning_rate": 2.8498232127699725e-07, "loss": 0.0513, "num_tokens": 1887472206.0, "reward": 2.3643975257873535, "reward_std": 0.39711326360702515, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1082.7523193359375, "completions/mean_terminated_length": 826.4434814453125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7309147088594108, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10795708636100353, "kl": 0.0240478515625, "learning_rate": 2.8471184167300253e-07, "loss": 0.0928, "num_tokens": 1888020367.0, "reward": 2.3392858505249023, "reward_std": 0.43019357323646545, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1059.352783203125, "completions/mean_terminated_length": 831.2033081054688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7311278035267167, "frac_reward_zero_std": 0.0, "grad_norm": 0.11889543676058181, "kl": 0.027008056640625, "learning_rate": 2.844415088856978e-07, "loss": 0.0393, "num_tokens": 1888559565.0, "reward": 2.4609375, "reward_std": 0.43998026847839355, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12758491933345795, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1007.4888916015625, "completions/mean_terminated_length": 814.8015747070312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7313408981940227, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12055097754386625, "kl": 0.02777099609375, "learning_rate": 2.8417132306469084e-07, "loss": 0.063, "num_tokens": 1889078920.0, "reward": 2.4185268878936768, "reward_std": 0.41142937541007996, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.11899843066930771, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1000.7366333007812, "completions/mean_terminated_length": 711.3219604492188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7315539928613286, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.127692054687112, "kl": 0.0277099609375, "learning_rate": 2.8390128435950926e-07, "loss": 0.0696, "num_tokens": 1889601666.0, "reward": 2.33203125, "reward_std": 0.467107355594635, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.17351123690605164, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 925.5245971679688, "completions/mean_terminated_length": 745.2305297851562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7317670875286346, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13244525952250213, "kl": 0.026580810546875, "learning_rate": 2.836313929195987e-07, "loss": 0.0496, "num_tokens": 1890088925.0, "reward": 2.4542412757873535, "reward_std": 0.3674367666244507, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1008.10498046875, "completions/mean_terminated_length": 795.6531982421875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7319801821959405, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11900216531987744, "kl": 0.0299072265625, "learning_rate": 2.8336164889432323e-07, "loss": 0.0281, "num_tokens": 1890608620.0, "reward": 2.49609375, "reward_std": 0.39869406819343567, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13180458545684814, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1016.6763916015625, "completions/mean_terminated_length": 785.61474609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7321932768632465, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12972400230234757, "kl": 0.0277099609375, "learning_rate": 2.830920524329658e-07, "loss": 0.0873, "num_tokens": 1891133083.0, "reward": 2.4603796005249023, "reward_std": 0.39249271154403687, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15853238105773926, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1072.046875, "completions/mean_terminated_length": 769.5584716796875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7324063715305524, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1547349320933971, "kl": 0.02862548828125, "learning_rate": 2.828226036847271e-07, "loss": 0.0957, "num_tokens": 1891680960.0, "reward": 2.310267925262451, "reward_std": 0.47386637330055237, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1620074063539505, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1125.075927734375, "completions/mean_terminated_length": 849.5362548828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7326194661978584, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11588134102096417, "kl": 0.02362060546875, "learning_rate": 2.825533027987267e-07, "loss": 0.0916, "num_tokens": 1892252562.0, "reward": 2.4503350257873535, "reward_std": 0.4494834840297699, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1117.7835693359375, "completions/mean_terminated_length": 893.6038818359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7328325608651644, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12306119461212686, "kl": 0.02740478515625, "learning_rate": 2.8228414992400217e-07, "loss": 0.1132, "num_tokens": 1892815761.0, "reward": 2.3956475257873535, "reward_std": 0.5686297416687012, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.170974463224411, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1064.122802734375, "completions/mean_terminated_length": 799.3399047851562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7330456555324703, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13123760740262957, "kl": 0.02606201171875, "learning_rate": 2.820151452095083e-07, "loss": 0.1018, "num_tokens": 1893369560.0, "reward": 2.4341518878936768, "reward_std": 0.4745834469795227, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16057941317558289, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 986.5067138671875, "completions/mean_terminated_length": 759.2493286132812, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7332587501997763, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12208238629650009, "kl": 0.026336669921875, "learning_rate": 2.817462888041193e-07, "loss": 0.0947, "num_tokens": 1893879979.0, "reward": 2.4609375, "reward_std": 0.4501286447048187, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13915444910526276, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1039.134033203125, "completions/mean_terminated_length": 826.4541015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7334718448670822, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11353960645070311, "kl": 0.023681640625, "learning_rate": 2.814775808566263e-07, "loss": 0.0331, "num_tokens": 1894415815.0, "reward": 2.478236675262451, "reward_std": 0.3729163110256195, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09668362140655518, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 895.638427734375, "completions/mean_terminated_length": 788.8341674804688, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.7336849395343882, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14788515588120468, "kl": 0.032745361328125, "learning_rate": 2.812090215157388e-07, "loss": 0.0939, "num_tokens": 1894885125.0, "reward": 2.5825893878936768, "reward_std": 0.43872371315956116, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.46403056383132935, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 967.0781860351562, "completions/mean_terminated_length": 776.9947509765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7338980342016941, "frac_reward_zero_std": 0.0, "grad_norm": 0.14387623739469202, "kl": 0.028533935546875, "learning_rate": 2.809406109300834e-07, "loss": 0.0971, "num_tokens": 1895387576.0, "reward": 2.46484375, "reward_std": 0.5205824971199036, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.18014755845069885, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1071.3773193359375, "completions/mean_terminated_length": 764.9296264648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7341111288690001, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13057785123646307, "kl": 0.0240478515625, "learning_rate": 2.806723492482053e-07, "loss": 0.0976, "num_tokens": 1895942129.0, "reward": 2.404576063156128, "reward_std": 0.5068159103393555, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16403579711914062, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1103.997802734375, "completions/mean_terminated_length": 804.1382446289062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.734324223536306, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12612839498942743, "kl": 0.024810791015625, "learning_rate": 2.804042366185667e-07, "loss": 0.0779, "num_tokens": 1896503616.0, "reward": 2.368861675262451, "reward_std": 0.45546820759773254, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.1728263646364212, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1110.015625, "completions/mean_terminated_length": 840.4798583984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7345373182036119, "frac_reward_zero_std": 0.0, "grad_norm": 0.12849803606219865, "kl": 0.024688720703125, "learning_rate": 2.8013627318954723e-07, "loss": 0.0882, "num_tokens": 1897069575.0, "reward": 2.2354912757873535, "reward_std": 0.43667352199554443, "rewards/accuracy_reward/mean": 0.37731480598449707, "rewards/accuracy_reward/std": 0.4852766990661621, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16988763213157654, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 964.7254638671875, "completions/mean_terminated_length": 722.0245971679688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7347504128709179, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14044283063040156, "kl": 0.02764892578125, "learning_rate": 2.7986845910944433e-07, "loss": 0.1366, "num_tokens": 1897570284.0, "reward": 2.447544813156128, "reward_std": 0.45131516456604004, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16988763213157654, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 988.7545166015625, "completions/mean_terminated_length": 808.9869384765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7349635075382238, "frac_reward_zero_std": 0.0, "grad_norm": 0.1522361476399345, "kl": 0.029388427734375, "learning_rate": 2.7960079452647223e-07, "loss": 0.097, "num_tokens": 1898082286.0, "reward": 2.380580425262451, "reward_std": 0.5102304220199585, "rewards/accuracy_reward/mean": 0.5092592835426331, "rewards/accuracy_reward/std": 0.5004938840866089, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16460275650024414, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 883.72998046875, "completions/mean_terminated_length": 693.2129516601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7351766022055298, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1403254926354319, "kl": 0.03173828125, "learning_rate": 2.7933327958876353e-07, "loss": 0.099, "num_tokens": 1898544373.0, "reward": 2.5011162757873535, "reward_std": 0.39355558156967163, "rewards/accuracy_reward/mean": 0.6111111044883728, "rewards/accuracy_reward/std": 0.488063246011734, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559476852417, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1010.7031860351562, "completions/mean_terminated_length": 798.7822875976562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7353896968728357, "frac_reward_zero_std": 0.0, "grad_norm": 0.13476578623333105, "kl": 0.027618408203125, "learning_rate": 2.790659144443665e-07, "loss": 0.0865, "num_tokens": 1899062496.0, "reward": 2.5033483505249023, "reward_std": 0.4906647503376007, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15788236260414124, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1025.3638916015625, "completions/mean_terminated_length": 813.1185913085938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7356027915401417, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13425074228996248, "kl": 0.027587890625, "learning_rate": 2.7879869924124756e-07, "loss": 0.1036, "num_tokens": 1899590259.0, "reward": 2.369419813156128, "reward_std": 0.43779218196868896, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15391045808792114, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1128.1273193359375, "completions/mean_terminated_length": 828.7603759765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7358158862074476, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12962469915678396, "kl": 0.024993896484375, "learning_rate": 2.7853163412729e-07, "loss": 0.1006, "num_tokens": 1900165532.0, "reward": 2.3543527126312256, "reward_std": 0.44905099272727966, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14025692641735077, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1066.7388916015625, "completions/mean_terminated_length": 791.9857177734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7360289808747537, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13704990446898468, "kl": 0.02703857421875, "learning_rate": 2.782647192502938e-07, "loss": 0.1069, "num_tokens": 1900708487.0, "reward": 2.41796875, "reward_std": 0.49144795536994934, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.17302128672599792, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 958.3348388671875, "completions/mean_terminated_length": 766.7139282226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7362420755420596, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13511312582994642, "kl": 0.029541015625, "learning_rate": 2.779979547579759e-07, "loss": 0.0464, "num_tokens": 1901201677.0, "reward": 2.3582589626312256, "reward_std": 0.4277730882167816, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1468711644411087, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1149.2254638671875, "completions/mean_terminated_length": 856.724853515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7364551702093655, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12135483923037568, "kl": 0.02593994140625, "learning_rate": 2.7773134079797e-07, "loss": 0.1183, "num_tokens": 1901790834.0, "reward": 2.3705358505249023, "reward_std": 0.4890635311603546, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.17407242953777313, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1024.4710693359375, "completions/mean_terminated_length": 777.8032836914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.7366682648766715, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12016402931133427, "kl": 0.02642822265625, "learning_rate": 2.7746487751782666e-07, "loss": 0.0266, "num_tokens": 1902324405.0, "reward": 2.3895089626312256, "reward_std": 0.41061267256736755, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1421368569135666, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 948.7545166015625, "completions/mean_terminated_length": 782.0308227539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7368813595439774, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11516075086670463, "kl": 0.02825927734375, "learning_rate": 2.7719856506501306e-07, "loss": 0.0744, "num_tokens": 1902814423.0, "reward": 2.493861675262451, "reward_std": 0.408820778131485, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517839670181274, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1186.263427734375, "completions/mean_terminated_length": 885.1746826171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.7370944542112834, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11188611030742304, "kl": 0.0211181640625, "learning_rate": 2.7693240358691216e-07, "loss": 0.1032, "num_tokens": 1903415533.0, "reward": 2.341517925262451, "reward_std": 0.44203147292137146, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17942628264427185, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1090.5648193359375, "completions/mean_terminated_length": 891.8517456054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7373075488785893, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12408047414546623, "kl": 0.026153564453125, "learning_rate": 2.766663932308244e-07, "loss": 0.0641, "num_tokens": 1903973114.0, "reward": 2.3588171005249023, "reward_std": 0.5045943260192871, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15252020955085754, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1041.930908203125, "completions/mean_terminated_length": 771.1755981445312, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7375206435458953, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1234701359575885, "kl": 0.025848388671875, "learning_rate": 2.7640053414396626e-07, "loss": 0.0644, "num_tokens": 1904509115.0, "reward": 2.3856027126312256, "reward_std": 0.44344204664230347, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11418405175209045, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1036.7991943359375, "completions/mean_terminated_length": 757.3504028320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7377337382132012, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2512970781528178, "kl": 0.027618408203125, "learning_rate": 2.761348264734701e-07, "loss": 0.0627, "num_tokens": 1905054257.0, "reward": 2.330357313156128, "reward_std": 0.4192860722541809, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.128681018948555, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 953.7277221679688, "completions/mean_terminated_length": 747.64453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7379468328805072, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13757513157259688, "kl": 0.027252197265625, "learning_rate": 2.7586927036638494e-07, "loss": 0.0908, "num_tokens": 1905553335.0, "reward": 2.4056921005249023, "reward_std": 0.36265894770622253, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780035495758057, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 977.80810546875, "completions/mean_terminated_length": 779.6243286132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7381599275478131, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13983331108529778, "kl": 0.028472900390625, "learning_rate": 2.7560386596967553e-07, "loss": 0.1025, "num_tokens": 1906061825.0, "reward": 2.4637277126312256, "reward_std": 0.4490770697593689, "rewards/accuracy_reward/mean": 0.5902777910232544, "rewards/accuracy_reward/std": 0.49235257506370544, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16202956438064575, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 911.1116333007812, "completions/mean_terminated_length": 731.9121704101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.738373022215119, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13964961402269552, "kl": 0.0283203125, "learning_rate": 2.7533861343022325e-07, "loss": 0.0334, "num_tokens": 1906541235.0, "reward": 2.3426339626312256, "reward_std": 0.3650625944137573, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13295157253742218, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1166.83935546875, "completions/mean_terminated_length": 883.5162353515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.738586116882425, "frac_reward_zero_std": 0.0, "grad_norm": 0.12765069854697872, "kl": 0.02410888671875, "learning_rate": 2.7507351289482495e-07, "loss": 0.0736, "num_tokens": 1907134811.0, "reward": 2.345982313156128, "reward_std": 0.5033276677131653, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1695971041917801, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 904.8370971679688, "completions/mean_terminated_length": 748.159912109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7387992115497309, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1440952967123607, "kl": 0.031402587890625, "learning_rate": 2.7480856451019334e-07, "loss": 0.0952, "num_tokens": 1907608722.0, "reward": 2.583705425262451, "reward_std": 0.40793558955192566, "rewards/accuracy_reward/mean": 0.6763392686843872, "rewards/accuracy_reward/std": 0.46839532256126404, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.11046778410673141, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1015.94873046875, "completions/mean_terminated_length": 770.7651977539062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.739012306217037, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1221198046870945, "kl": 0.02581787109375, "learning_rate": 2.7454376842295725e-07, "loss": 0.0641, "num_tokens": 1908132091.0, "reward": 2.377232313156128, "reward_std": 0.3425656259059906, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09329447150230408, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 828.7611694335938, "completions/mean_terminated_length": 675.5904541015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7392254008843429, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14017910963868296, "kl": 0.0318603515625, "learning_rate": 2.742791247796609e-07, "loss": 0.0445, "num_tokens": 1908568432.0, "reward": 2.5206475257873535, "reward_std": 0.363179087638855, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11317373067140579, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 997.0067138671875, "completions/mean_terminated_length": 805.6649169921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7394384955516489, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13436998160381114, "kl": 0.027130126953125, "learning_rate": 2.7401463372676435e-07, "loss": 0.1046, "num_tokens": 1909088995.0, "reward": 2.3783483505249023, "reward_std": 0.4117995798587799, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14969991147518158, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1033.384033203125, "completions/mean_terminated_length": 832.6310424804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7396515902189548, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11285202153271146, "kl": 0.025726318359375, "learning_rate": 2.7375029541064304e-07, "loss": 0.0455, "num_tokens": 1909618719.0, "reward": 2.3984375, "reward_std": 0.35587483644485474, "rewards/accuracy_reward/mean": 0.4791666567325592, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.10836843401193619, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1078.0491943359375, "completions/mean_terminated_length": 784.8081665039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7398646848862607, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.122578830556465, "kl": 0.024749755859375, "learning_rate": 2.7348610997758827e-07, "loss": 0.1056, "num_tokens": 1910172837.0, "reward": 2.369419813156128, "reward_std": 0.42102834582328796, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15645259618759155, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 998.279052734375, "completions/mean_terminated_length": 776.9865112304688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7400777795535667, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1319045417106927, "kl": 0.02923583984375, "learning_rate": 2.7322207757380624e-07, "loss": 0.0872, "num_tokens": 1910697042.0, "reward": 2.384486675262451, "reward_std": 0.4264853894710541, "rewards/accuracy_reward/mean": 0.5162037014961243, "rewards/accuracy_reward/std": 0.5003167390823364, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16169020533561707, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 879.4219360351562, "completions/mean_terminated_length": 732.6155395507812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7402908742208726, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13779882468186908, "kl": 0.03167724609375, "learning_rate": 2.729581983454187e-07, "loss": 0.0398, "num_tokens": 1911157215.0, "reward": 2.4536831378936768, "reward_std": 0.3912586569786072, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13903217017650604, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1032.763427734375, "completions/mean_terminated_length": 770.3988647460938, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.7405039688881786, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12311671928460928, "kl": 0.027923583984375, "learning_rate": 2.726944724384627e-07, "loss": 0.08, "num_tokens": 1911691781.0, "reward": 2.361049175262451, "reward_std": 0.37111398577690125, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661912620067596, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 972.8504638671875, "completions/mean_terminated_length": 753.1962280273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7407170635554845, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13943072372803567, "kl": 0.027130126953125, "learning_rate": 2.724308999988901e-07, "loss": 0.0328, "num_tokens": 1912191234.0, "reward": 2.3895089626312256, "reward_std": 0.4337301552295685, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16796617209911346, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1107.1741943359375, "completions/mean_terminated_length": 822.7383422851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7409301582227905, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1308343998283261, "kl": 0.02545166015625, "learning_rate": 2.721674811725686e-07, "loss": 0.1172, "num_tokens": 1912751440.0, "reward": 2.3448662757873535, "reward_std": 0.4103396534919739, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1043.40625, "completions/mean_terminated_length": 801.3019409179688, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.7411432528900964, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1365424929180653, "kl": 0.02545166015625, "learning_rate": 2.719042161052796e-07, "loss": 0.0948, "num_tokens": 1913290118.0, "reward": 2.3895089626312256, "reward_std": 0.5162388682365417, "rewards/accuracy_reward/mean": 0.5185185074806213, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 888.8214721679688, "completions/mean_terminated_length": 713.0076904296875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7413563475574024, "frac_reward_zero_std": 0.25, "grad_norm": 0.1117017238388437, "kl": 0.029327392578125, "learning_rate": 2.716411049427209e-07, "loss": 0.039, "num_tokens": 1913757734.0, "reward": 2.458705425262451, "reward_std": 0.32308757305145264, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.08887429535388947, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1076.3660888671875, "completions/mean_terminated_length": 775.2163696289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7415694422247083, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.5592069382020158, "kl": 0.122894287109375, "learning_rate": 2.7137814783050383e-07, "loss": 0.0815, "num_tokens": 1914314762.0, "reward": 2.42578125, "reward_std": 0.42657729983329773, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661914110183716, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1071.109375, "completions/mean_terminated_length": 808.206787109375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7417825368920142, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11953810062825712, "kl": 0.027099609375, "learning_rate": 2.711153449141554e-07, "loss": 0.0586, "num_tokens": 1914865611.0, "reward": 2.41796875, "reward_std": 0.4024464786052704, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1312635987997055, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 915.8326416015625, "completions/mean_terminated_length": 757.3867797851562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7419956315593202, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14218067422316027, "kl": 0.030426025390625, "learning_rate": 2.708526963391167e-07, "loss": 0.0845, "num_tokens": 1915347024.0, "reward": 2.537388563156128, "reward_std": 0.388633131980896, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11660737544298172, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 985.4241333007812, "completions/mean_terminated_length": 781.9520874023438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7422087262266261, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12886452836336423, "kl": 0.02740478515625, "learning_rate": 2.7059020225074354e-07, "loss": 0.0613, "num_tokens": 1915853070.0, "reward": 2.447544813156128, "reward_std": 0.4652882516384125, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16680268943309784, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1034.6116943359375, "completions/mean_terminated_length": 820.9783935546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7424218208939322, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12501776245645774, "kl": 0.025848388671875, "learning_rate": 2.7032786279430656e-07, "loss": 0.0596, "num_tokens": 1916387424.0, "reward": 2.3275671005249023, "reward_std": 0.4162757098674774, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 990.0535888671875, "completions/mean_terminated_length": 851.1312866210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7426349155612381, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1328114569763756, "kl": 0.028076171875, "learning_rate": 2.7006567811499047e-07, "loss": 0.0987, "num_tokens": 1916900024.0, "reward": 2.454799175262451, "reward_std": 0.4706345796585083, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1134.2857666015625, "completions/mean_terminated_length": 826.07763671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7428480102285441, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12279710045297361, "kl": 0.025299072265625, "learning_rate": 2.6980364835789444e-07, "loss": 0.109, "num_tokens": 1917482056.0, "reward": 2.239955425262451, "reward_std": 0.527755081653595, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15283602476119995, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 952.9063110351562, "completions/mean_terminated_length": 770.390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.74306110489585, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13688211120565447, "kl": 0.03094482421875, "learning_rate": 2.695417736680318e-07, "loss": 0.0976, "num_tokens": 1917976862.0, "reward": 2.4871652126312256, "reward_std": 0.4360974431037903, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1061.4576416015625, "completions/mean_terminated_length": 827.085693359375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7432741995631559, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12300436453475487, "kl": 0.025787353515625, "learning_rate": 2.692800541903302e-07, "loss": 0.0812, "num_tokens": 1918521307.0, "reward": 2.4676339626312256, "reward_std": 0.5184011459350586, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15063102543354034, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 936.1451416015625, "completions/mean_terminated_length": 719.7039794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7434872942304619, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12759943402613363, "kl": 0.02984619140625, "learning_rate": 2.690184900696313e-07, "loss": 0.0449, "num_tokens": 1919007596.0, "reward": 2.5184152126312256, "reward_std": 0.3949706256389618, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14869897067546844, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 954.7656860351562, "completions/mean_terminated_length": 734.9464111328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7437003888977678, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14106229739273263, "kl": 0.027740478515625, "learning_rate": 2.6875708145069065e-07, "loss": 0.0905, "num_tokens": 1919501107.0, "reward": 2.372767925262451, "reward_std": 0.484919011592865, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16456104815006256, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1061.72998046875, "completions/mean_terminated_length": 847.3233642578125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7439134835650738, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1163484796512355, "kl": 0.026763916015625, "learning_rate": 2.6849582847817843e-07, "loss": 0.0334, "num_tokens": 1920046554.0, "reward": 2.4464287757873535, "reward_std": 0.39093706011772156, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11805575340986252, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1114.0045166015625, "completions/mean_terminated_length": 824.5204467773438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7441265782323797, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11948643836840166, "kl": 0.025848388671875, "learning_rate": 2.6823473129667787e-07, "loss": 0.033, "num_tokens": 1920625516.0, "reward": 2.2081475257873535, "reward_std": 0.4602033495903015, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1787492334842682, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 818.4777221679688, "completions/mean_terminated_length": 701.2371826171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7443396728996857, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1299716580565299, "kl": 0.035369873046875, "learning_rate": 2.6797379005068594e-07, "loss": 0.0457, "num_tokens": 1921053490.0, "reward": 2.5736608505249023, "reward_std": 0.37030670046806335, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12782442569732666, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 967.810302734375, "completions/mean_terminated_length": 743.6199340820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7445527675669916, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2883007918642504, "kl": 0.0340576171875, "learning_rate": 2.6771300488461405e-07, "loss": 0.1406, "num_tokens": 1921561917.0, "reward": 2.4598214626312256, "reward_std": 0.4820263981819153, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16899244487285614, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1110.05810546875, "completions/mean_terminated_length": 786.1441650390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7447658622342976, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.3346360506286928, "kl": 0.027191162109375, "learning_rate": 2.674523759427867e-07, "loss": 0.1122, "num_tokens": 1922136967.0, "reward": 2.3286831378936768, "reward_std": 0.427215039730072, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1454136222600937, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1096.4754638671875, "completions/mean_terminated_length": 815.9682006835938, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7449789569016035, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11798150624817462, "kl": 0.02471923828125, "learning_rate": 2.671919033694423e-07, "loss": 0.0542, "num_tokens": 1922697436.0, "reward": 2.431361675262451, "reward_std": 0.44612422585487366, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813789844513, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1071.712158203125, "completions/mean_terminated_length": 805.4517211914062, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7451920515689094, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12965966948036092, "kl": 0.025634765625, "learning_rate": 2.66931587308732e-07, "loss": 0.0701, "num_tokens": 1923246539.0, "reward": 2.4441964626312256, "reward_std": 0.4448496103286743, "rewards/accuracy_reward/mean": 0.5671296119689941, "rewards/accuracy_reward/std": 0.4960475564002991, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13271193206310272, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1039.40185546875, "completions/mean_terminated_length": 796.3323974609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7454051462362155, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13061103848622702, "kl": 0.026519775390625, "learning_rate": 2.6667142790472116e-07, "loss": 0.0979, "num_tokens": 1923783503.0, "reward": 2.36328125, "reward_std": 0.401105672121048, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1037.8125, "completions/mean_terminated_length": 773.1718139648438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7456182409035214, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12107353137447877, "kl": 0.02606201171875, "learning_rate": 2.6641142530138814e-07, "loss": 0.0749, "num_tokens": 1924321595.0, "reward": 2.3744421005249023, "reward_std": 0.397959440946579, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519908487796783, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1033.0513916015625, "completions/mean_terminated_length": 841.9071655273438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7458313355708274, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11664976188410903, "kl": 0.025299072265625, "learning_rate": 2.6615157964262436e-07, "loss": 0.0846, "num_tokens": 1924852562.0, "reward": 2.454799175262451, "reward_std": 0.42508262395858765, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 978.62060546875, "completions/mean_terminated_length": 773.845703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7460444302381333, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13066019456290967, "kl": 0.029022216796875, "learning_rate": 2.658918910722344e-07, "loss": 0.0762, "num_tokens": 1925358536.0, "reward": 2.4268975257873535, "reward_std": 0.4405536949634552, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12909629940986633, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 983.0803833007812, "completions/mean_terminated_length": 733.718994140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7462575249054393, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1397188026506883, "kl": 0.028839111328125, "learning_rate": 2.656323597339361e-07, "loss": 0.112, "num_tokens": 1925873756.0, "reward": 2.5518975257873535, "reward_std": 0.437661737203598, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1246887594461441, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1036.415283203125, "completions/mean_terminated_length": 782.1061401367188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7464706195727452, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13525837584867187, "kl": 0.028045654296875, "learning_rate": 2.653729857713604e-07, "loss": 0.0875, "num_tokens": 1926412134.0, "reward": 2.357701063156128, "reward_std": 0.4112618565559387, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15551921725273132, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 955.9777221679688, "completions/mean_terminated_length": 729.33154296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7466837142400511, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14044591093172423, "kl": 0.02783203125, "learning_rate": 2.651137693280506e-07, "loss": 0.1089, "num_tokens": 1926910364.0, "reward": 2.439732313156128, "reward_std": 0.4063902199268341, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1131.841552734375, "completions/mean_terminated_length": 904.7158813476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7468968089073571, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11957806596376697, "kl": 0.022857666015625, "learning_rate": 2.6485471054746315e-07, "loss": 0.0627, "num_tokens": 1927487765.0, "reward": 2.404017925262451, "reward_std": 0.47729891538619995, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 854.4710083007812, "completions/mean_terminated_length": 640.8921508789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.747109903574663, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13722161850513545, "kl": 0.03155517578125, "learning_rate": 2.6459580957296757e-07, "loss": 0.0681, "num_tokens": 1927931800.0, "reward": 2.40625, "reward_std": 0.3701859414577484, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09585530310869217, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1142.013427734375, "completions/mean_terminated_length": 885.0143432617188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.747322998241969, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11558254378589872, "kl": 0.024444580078125, "learning_rate": 2.6433706654784555e-07, "loss": 0.0318, "num_tokens": 1928523918.0, "reward": 2.4564733505249023, "reward_std": 0.40328356623649597, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11752051115036011, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 927.4107666015625, "completions/mean_terminated_length": 733.801025390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7475360929092749, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1312303597305569, "kl": 0.0301513671875, "learning_rate": 2.640784816152916e-07, "loss": 0.0414, "num_tokens": 1929010150.0, "reward": 2.5066964626312256, "reward_std": 0.4345788359642029, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 918.7902221679688, "completions/mean_terminated_length": 747.5218505859375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7477491875765809, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12394669621573583, "kl": 0.03009033203125, "learning_rate": 2.6382005491841244e-07, "loss": 0.0181, "num_tokens": 1929489688.0, "reward": 2.3878350257873535, "reward_std": 0.39658868312835693, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11614610999822617, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 885.904052734375, "completions/mean_terminated_length": 719.8903198242188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7479622822438868, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.5240799943464431, "kl": 0.054046630859375, "learning_rate": 2.635617866002278e-07, "loss": 0.0685, "num_tokens": 1929960253.0, "reward": 2.4871652126312256, "reward_std": 0.4223470091819763, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12242549657821655, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 880.5535888671875, "completions/mean_terminated_length": 671.6421508789062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7481753769111928, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.1479007680914857, "kl": 0.030303955078125, "learning_rate": 2.633036768036695e-07, "loss": 0.0914, "num_tokens": 1930423189.0, "reward": 2.400111675262451, "reward_std": 0.3164609670639038, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813715338707, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1149.5960693359375, "completions/mean_terminated_length": 850.1279907226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7483884715784987, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12626343790534925, "kl": 0.025054931640625, "learning_rate": 2.6304572567158105e-07, "loss": 0.137, "num_tokens": 1931011232.0, "reward": 2.3565850257873535, "reward_std": 0.4179708659648895, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550675868988, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15266746282577515, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 1029.2567138671875, "completions/mean_terminated_length": 758.742919921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7486015662458047, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12138239803794859, "kl": 0.025634765625, "learning_rate": 2.627879333467191e-07, "loss": 0.0658, "num_tokens": 1931544803.0, "reward": 2.39453125, "reward_std": 0.42185673117637634, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15968577563762665, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1019.8460083007812, "completions/mean_terminated_length": 768.5194702148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7488146609131107, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11616291493775444, "kl": 0.02825927734375, "learning_rate": 2.6253029997175186e-07, "loss": 0.0531, "num_tokens": 1932068110.0, "reward": 2.3448662757873535, "reward_std": 0.3450391888618469, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 857.2656860351562, "completions/mean_terminated_length": 680.1820678710938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7490277555804166, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1265011559597205, "kl": 0.028167724609375, "learning_rate": 2.622728256892597e-07, "loss": 0.0753, "num_tokens": 1932525749.0, "reward": 2.560826063156128, "reward_std": 0.3422622084617615, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.485796183347702, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523807287216, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 996.3035888671875, "completions/mean_terminated_length": 771.1436767578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7492408502477226, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13357169768061244, "kl": 0.026824951171875, "learning_rate": 2.620155106417348e-07, "loss": 0.0652, "num_tokens": 1933045437.0, "reward": 2.501674175262451, "reward_std": 0.4418174624443054, "rewards/accuracy_reward/mean": 0.6157407164573669, "rewards/accuracy_reward/std": 0.48698359727859497, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 945.1094360351562, "completions/mean_terminated_length": 781.0897827148438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7494539449150285, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1338695465535572, "kl": 0.029541015625, "learning_rate": 2.6175835497158125e-07, "loss": 0.0831, "num_tokens": 1933533438.0, "reward": 2.482701063156128, "reward_std": 0.4090162217617035, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801211535930634, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 859.3192138671875, "completions/mean_terminated_length": 689.5076293945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7496670395823345, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1504201522100094, "kl": 0.032867431640625, "learning_rate": 2.6150135882111544e-07, "loss": 0.0933, "num_tokens": 1933985309.0, "reward": 2.532924175262451, "reward_std": 0.42589065432548523, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1116.1607666015625, "completions/mean_terminated_length": 862.0227661132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7498801342496404, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2604814505094918, "kl": 0.025360107421875, "learning_rate": 2.612445223325648e-07, "loss": 0.0593, "num_tokens": 1934567189.0, "reward": 2.3136162757873535, "reward_std": 0.44190889596939087, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1549774408340454, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1072.0067138671875, "completions/mean_terminated_length": 812.8446044921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7500932289169464, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12756135792043935, "kl": 0.026397705078125, "learning_rate": 2.6098784564806875e-07, "loss": 0.1015, "num_tokens": 1935121880.0, "reward": 2.3705358505249023, "reward_std": 0.4008837640285492, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 901.2723388671875, "completions/mean_terminated_length": 663.272216796875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7503063235842523, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14866352343964825, "kl": 0.034820556640625, "learning_rate": 2.607313289096779e-07, "loss": 0.0479, "num_tokens": 1935590114.0, "reward": 2.439732313156128, "reward_std": 0.3268306255340576, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1082.5648193359375, "completions/mean_terminated_length": 853.2072143554688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7505194182515582, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11294717013774547, "kl": 0.024505615234375, "learning_rate": 2.6047497225935523e-07, "loss": 0.0836, "num_tokens": 1936151231.0, "reward": 2.4229912757873535, "reward_std": 0.39541906118392944, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1130.399658203125, "completions/mean_terminated_length": 831.772216796875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7507325129188642, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1206857297800884, "kl": 0.026702880859375, "learning_rate": 2.60218775838974e-07, "loss": 0.0567, "num_tokens": 1936725042.0, "reward": 2.306361675262451, "reward_std": 0.48713958263397217, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.18550105392932892, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 983.90185546875, "completions/mean_terminated_length": 738.3406982421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7509456075861701, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14549911823578737, "kl": 0.02783203125, "learning_rate": 2.599627397903193e-07, "loss": 0.1225, "num_tokens": 1937240182.0, "reward": 2.4296875, "reward_std": 0.5068896412849426, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15465489029884338, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1045.09375, "completions/mean_terminated_length": 753.1815185546875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7511587022534761, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13782383556429997, "kl": 0.027618408203125, "learning_rate": 2.5970686425508783e-07, "loss": 0.0618, "num_tokens": 1937778736.0, "reward": 2.36328125, "reward_std": 0.4292701482772827, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11734377592802048, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 945.8594360351562, "completions/mean_terminated_length": 731.309326171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.751371796920782, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13754740482309039, "kl": 0.0283203125, "learning_rate": 2.59451149374887e-07, "loss": 0.046, "num_tokens": 1938268353.0, "reward": 2.4135046005249023, "reward_std": 0.3300292193889618, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10940459370613098, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 919.58935546875, "completions/mean_terminated_length": 777.8291625976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.751584891588088, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1174617002064403, "kl": 0.028533935546875, "learning_rate": 2.591955952912353e-07, "loss": 0.0473, "num_tokens": 1938753241.0, "reward": 2.5870537757873535, "reward_std": 0.4012986719608307, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.10325382649898529, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1010.7902221679688, "completions/mean_terminated_length": 798.8870849609375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.751797986255394, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12752559118865228, "kl": 0.025482177734375, "learning_rate": 2.5894020214556246e-07, "loss": 0.0881, "num_tokens": 1939279755.0, "reward": 2.3482143878936768, "reward_std": 0.4496425986289978, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15220166742801666, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 912.779052734375, "completions/mean_terminated_length": 720.1174926757812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7520110809226999, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1424765568988169, "kl": 0.034393310546875, "learning_rate": 2.5868497007920887e-07, "loss": 0.107, "num_tokens": 1939766936.0, "reward": 2.4654018878936768, "reward_std": 0.3953595459461212, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.09196697175502777, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1044.4710693359375, "completions/mean_terminated_length": 812.8873901367188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7522241755900059, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11476163690264074, "kl": 0.02606201171875, "learning_rate": 2.584298992334263e-07, "loss": 0.0567, "num_tokens": 1940305547.0, "reward": 2.4408483505249023, "reward_std": 0.4668347239494324, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14744803309440613, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1118.15185546875, "completions/mean_terminated_length": 847.5042724609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7524372702573118, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12287073235266664, "kl": 0.028106689453125, "learning_rate": 2.5817498974937654e-07, "loss": 0.0793, "num_tokens": 1940875615.0, "reward": 2.2583706378936768, "reward_std": 0.37075507640838623, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1043.5625, "completions/mean_terminated_length": 808.3636474609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7526503649246178, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11882026885742407, "kl": 0.025421142578125, "learning_rate": 2.579202417681328e-07, "loss": 0.0685, "num_tokens": 1941408027.0, "reward": 2.282924175262451, "reward_std": 0.38527238368988037, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13099703192710876, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1056.9576416015625, "completions/mean_terminated_length": 844.783203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7528634595919237, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11700048784282295, "kl": 0.0260009765625, "learning_rate": 2.576656554306783e-07, "loss": 0.0528, "num_tokens": 1941954696.0, "reward": 2.423549175262451, "reward_std": 0.398449569940567, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1480342447757721, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1007.8906860351562, "completions/mean_terminated_length": 795.3951416015625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7530765542592297, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1280614576504268, "kl": 0.0267333984375, "learning_rate": 2.5741123087790734e-07, "loss": 0.1171, "num_tokens": 1942473463.0, "reward": 2.41015625, "reward_std": 0.41179290413856506, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692909002304, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1018.8660888671875, "completions/mean_terminated_length": 808.6129150390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7532896489265356, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14203926026606373, "kl": 0.02801513671875, "learning_rate": 2.5715696825062426e-07, "loss": 0.1109, "num_tokens": 1942999051.0, "reward": 2.3939733505249023, "reward_std": 0.4207933843135834, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.11730785667896271, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1063.3616943359375, "completions/mean_terminated_length": 801.9039306640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7535027435938416, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12285770090197136, "kl": 0.025726318359375, "learning_rate": 2.5690286768954395e-07, "loss": 0.038, "num_tokens": 1943545357.0, "reward": 2.4871652126312256, "reward_std": 0.4131576120853424, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13124457001686096, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 954.2589721679688, "completions/mean_terminated_length": 771.96875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7537158382611475, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14278764091455756, "kl": 0.030029296875, "learning_rate": 2.566489293352918e-07, "loss": 0.0936, "num_tokens": 1944039169.0, "reward": 2.51953125, "reward_std": 0.4585893452167511, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1247187927365303, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1148.265625, "completions/mean_terminated_length": 879.6492919921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7539289329284534, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11359663314860509, "kl": 0.021759033203125, "learning_rate": 2.5639515332840324e-07, "loss": 0.0834, "num_tokens": 1944627192.0, "reward": 2.3515625, "reward_std": 0.38736072182655334, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4966535270214081, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11752051115036011, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1099.57373046875, "completions/mean_terminated_length": 844.3314208984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7541420275957594, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11785983211313968, "kl": 0.02374267578125, "learning_rate": 2.5614153980932366e-07, "loss": 0.0596, "num_tokens": 1945191561.0, "reward": 2.4291296005249023, "reward_std": 0.4149930775165558, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10092891752719879, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1093.9888916015625, "completions/mean_terminated_length": 775.985107421875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7543551222630653, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11803431143361388, "kl": 0.02618408203125, "learning_rate": 2.5588808891840897e-07, "loss": 0.068, "num_tokens": 1945748212.0, "reward": 2.4681921005249023, "reward_std": 0.4166988134384155, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12428762763738632, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1012.341552734375, "completions/mean_terminated_length": 790.6151733398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7545682169303713, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13351143303602822, "kl": 0.027618408203125, "learning_rate": 2.556348007959246e-07, "loss": 0.0415, "num_tokens": 1946281181.0, "reward": 2.4330358505249023, "reward_std": 0.38546356558799744, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1033746674656868, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1020.3906860351562, "completions/mean_terminated_length": 813.7667846679688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7547813115976773, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14625910080746216, "kl": 0.030517578125, "learning_rate": 2.5538167558204625e-07, "loss": 0.0693, "num_tokens": 1946807692.0, "reward": 2.466517925262451, "reward_std": 0.4417068362236023, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11686538904905319, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 991.0870971679688, "completions/mean_terminated_length": 801.955322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7549944062649833, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1276555336301265, "kl": 0.02886962890625, "learning_rate": 2.551287134168593e-07, "loss": 0.0579, "num_tokens": 1947321491.0, "reward": 2.385044813156128, "reward_std": 0.44310033321380615, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14876298606395721, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1056.5670166015625, "completions/mean_terminated_length": 803.8487548828125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7552075009322892, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1283431741677651, "kl": 0.02557373046875, "learning_rate": 2.548759144403591e-07, "loss": 0.0955, "num_tokens": 1947871825.0, "reward": 2.2896206378936768, "reward_std": 0.41937577724456787, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15122967958450317, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1091.5023193359375, "completions/mean_terminated_length": 851.0418701171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7554205955995951, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13434317132116813, "kl": 0.025634765625, "learning_rate": 2.5462327879245064e-07, "loss": 0.068, "num_tokens": 1948429394.0, "reward": 2.4224331378936768, "reward_std": 0.5017857551574707, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1032.888427734375, "completions/mean_terminated_length": 835.2799682617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7556336902669011, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13709049878649024, "kl": 0.026947021484375, "learning_rate": 2.5437080661294785e-07, "loss": 0.0992, "num_tokens": 1948959376.0, "reward": 2.3666296005249023, "reward_std": 0.4290775656700134, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12338031083345413, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 984.4107666015625, "completions/mean_terminated_length": 753.1956787109375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.755846784934207, "frac_reward_zero_std": 0.25, "grad_norm": 0.12042122035549503, "kl": 0.02716064453125, "learning_rate": 2.5411849804157524e-07, "loss": 0.0659, "num_tokens": 1949464808.0, "reward": 2.396205425262451, "reward_std": 0.30729636549949646, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14409084618091583, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 952.1763916015625, "completions/mean_terminated_length": 776.1632080078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.756059879601513, "frac_reward_zero_std": 0.0, "grad_norm": 0.1354007590023983, "kl": 0.030548095703125, "learning_rate": 2.5386635321796613e-07, "loss": 0.0733, "num_tokens": 1949961575.0, "reward": 2.4810268878936768, "reward_std": 0.4675096273422241, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14819122850894928, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1005.5625610351562, "completions/mean_terminated_length": 782.3848266601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7562729742688189, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1397903050341739, "kl": 0.026397705078125, "learning_rate": 2.536143722816636e-07, "loss": 0.1048, "num_tokens": 1950485955.0, "reward": 2.3080358505249023, "reward_std": 0.42131131887435913, "rewards/accuracy_reward/mean": 0.42824074625968933, "rewards/accuracy_reward/std": 0.4953974783420563, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13840332627296448, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 972.93310546875, "completions/mean_terminated_length": 790.4804077148438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7564860689361249, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1291851724264226, "kl": 0.0277099609375, "learning_rate": 2.5336255537211935e-07, "loss": 0.0408, "num_tokens": 1950995205.0, "reward": 2.4068081378936768, "reward_std": 0.4471297860145569, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13017487525939941, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 948.122802734375, "completions/mean_terminated_length": 747.8812866210938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7566991636034308, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13055931442172414, "kl": 0.02972412109375, "learning_rate": 2.531109026286952e-07, "loss": 0.0468, "num_tokens": 1951492700.0, "reward": 2.4559152126312256, "reward_std": 0.452176570892334, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1395072489976883, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1001.3928833007812, "completions/mean_terminated_length": 784.1724853515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7569122582707368, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12868678362427405, "kl": 0.028961181640625, "learning_rate": 2.5285941419066155e-07, "loss": 0.093, "num_tokens": 1952006332.0, "reward": 2.5541296005249023, "reward_std": 0.44663339853286743, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.13750630617141724, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1061.3125, "completions/mean_terminated_length": 795.7733764648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7571253529380427, "frac_reward_zero_std": 0.0, "grad_norm": 0.13779584654039653, "kl": 0.028656005859375, "learning_rate": 2.5260809019719794e-07, "loss": 0.1017, "num_tokens": 1952548792.0, "reward": 2.2622768878936768, "reward_std": 0.5183950662612915, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1634153127670288, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1018.90185546875, "completions/mean_terminated_length": 774.419921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7573384476053486, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12245419355886863, "kl": 0.028900146484375, "learning_rate": 2.5235693078739304e-07, "loss": 0.0375, "num_tokens": 1953078508.0, "reward": 2.4732143878936768, "reward_std": 0.3816849887371063, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13165414333343506, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 985.1406860351562, "completions/mean_terminated_length": 788.3148193359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7575515422726546, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12065835306994759, "kl": 0.02655029296875, "learning_rate": 2.521059361002441e-07, "loss": 0.0941, "num_tokens": 1953588443.0, "reward": 2.3130581378936768, "reward_std": 0.4270966947078705, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13170984387397766, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1100.384033203125, "completions/mean_terminated_length": 845.3597412109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7577646369399605, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12398138916722021, "kl": 0.02642822265625, "learning_rate": 2.51855106274658e-07, "loss": 0.0533, "num_tokens": 1954154871.0, "reward": 2.3716518878936768, "reward_std": 0.38874921202659607, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15208269655704498, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1026.6920166015625, "completions/mean_terminated_length": 791.0054931640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7579777316072666, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13494639588625432, "kl": 0.026153564453125, "learning_rate": 2.5160444144944936e-07, "loss": 0.0969, "num_tokens": 1954701933.0, "reward": 2.412388563156128, "reward_std": 0.4836609363555908, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13878051936626434, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1043.7410888671875, "completions/mean_terminated_length": 766.2108154296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7581908262745725, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.6311853281922855, "kl": 0.05853271484375, "learning_rate": 2.51353941763342e-07, "loss": 0.1118, "num_tokens": 1955242793.0, "reward": 2.404576063156128, "reward_std": 0.47136369347572327, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16824373602867126, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1031.071533203125, "completions/mean_terminated_length": 789.480712890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7584039209418785, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16113232936407149, "kl": 0.028411865234375, "learning_rate": 2.511036073549687e-07, "loss": 0.0945, "num_tokens": 1955779145.0, "reward": 2.3353796005249023, "reward_std": 0.4416998624801636, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14827017486095428, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 958.544677734375, "completions/mean_terminated_length": 746.4639892578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7586170156091844, "frac_reward_zero_std": 0.25, "grad_norm": 0.14311874398246424, "kl": 0.02728271484375, "learning_rate": 2.508534383628701e-07, "loss": 0.1244, "num_tokens": 1956275597.0, "reward": 2.4916296005249023, "reward_std": 0.37237876653671265, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1222418025135994, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1029.08935546875, "completions/mean_terminated_length": 793.9560546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7588301102764903, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12746763745529244, "kl": 0.027496337890625, "learning_rate": 2.506034349254956e-07, "loss": 0.0477, "num_tokens": 1956804597.0, "reward": 2.4659600257873535, "reward_std": 0.47420525550842285, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.1539783626794815, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 926.5625610351562, "completions/mean_terminated_length": 753.144287109375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7590432049437963, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13528317209219723, "kl": 0.029449462890625, "learning_rate": 2.50353597181203e-07, "loss": 0.0263, "num_tokens": 1957292385.0, "reward": 2.4609375, "reward_std": 0.42423495650291443, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14213687181472778, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 937.40185546875, "completions/mean_terminated_length": 724.7340087890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.7592562996111022, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.131637857665679, "kl": 0.026336669921875, "learning_rate": 2.5010392526825845e-07, "loss": 0.097, "num_tokens": 1957781733.0, "reward": 2.515625, "reward_std": 0.36496657133102417, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11409792304039001, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 945.8170166015625, "completions/mean_terminated_length": 702.5558471679688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7594693942784082, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1315357767767974, "kl": 0.029296875, "learning_rate": 2.498544193248363e-07, "loss": 0.0655, "num_tokens": 1958273651.0, "reward": 2.4681921005249023, "reward_std": 0.4260401129722595, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13776934146881104, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 936.8303833007812, "completions/mean_terminated_length": 765.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7596824889457141, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13561159221762725, "kl": 0.029754638671875, "learning_rate": 2.496050794890186e-07, "loss": 0.0826, "num_tokens": 1958760647.0, "reward": 2.369419813156128, "reward_std": 0.4852680563926697, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1361066997051239, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1091.9241943359375, "completions/mean_terminated_length": 831.1761474609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7598955836130201, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11312988337358344, "kl": 0.023773193359375, "learning_rate": 2.4935590589879627e-07, "loss": 0.0704, "num_tokens": 1959319685.0, "reward": 2.3956475257873535, "reward_std": 0.39094406366348267, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14249980449676514, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1109.8326416015625, "completions/mean_terminated_length": 896.4959106445312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.760108678280326, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11541046608609773, "kl": 0.02362060546875, "learning_rate": 2.491068986920677e-07, "loss": 0.0463, "num_tokens": 1959895178.0, "reward": 2.3705358505249023, "reward_std": 0.4613640308380127, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1019.8817138671875, "completions/mean_terminated_length": 826.2572631835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.760321772947632, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1165216032864574, "kl": 0.02740478515625, "learning_rate": 2.4885805800663927e-07, "loss": 0.0316, "num_tokens": 1960423029.0, "reward": 2.3097100257873535, "reward_std": 0.3930506110191345, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14801737666130066, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1045.3616943359375, "completions/mean_terminated_length": 817.3643798828125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7605348676149379, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13967905785880028, "kl": 0.028167724609375, "learning_rate": 2.486093839802253e-07, "loss": 0.089, "num_tokens": 1960957991.0, "reward": 2.4838171005249023, "reward_std": 0.41798660159111023, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1049.296875, "completions/mean_terminated_length": 784.1044921875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.7607479622822438, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13017657702219562, "kl": 0.029144287109375, "learning_rate": 2.483608767504477e-07, "loss": 0.0269, "num_tokens": 1961489436.0, "reward": 2.345424175262451, "reward_std": 0.46885377168655396, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14078108966350555, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 962.7232666015625, "completions/mean_terminated_length": 781.84375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7609610569495499, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16573863707475095, "kl": 0.02923583984375, "learning_rate": 2.481125364548364e-07, "loss": 0.0867, "num_tokens": 1961992592.0, "reward": 2.470424175262451, "reward_std": 0.4638521671295166, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.1539783775806427, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 952.0022583007812, "completions/mean_terminated_length": 749.0396728515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7611741516168558, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12540751817174545, "kl": 0.027740478515625, "learning_rate": 2.478643632308287e-07, "loss": 0.0506, "num_tokens": 1962486625.0, "reward": 2.4068081378936768, "reward_std": 0.3216070532798767, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1119314506649971, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 955.22998046875, "completions/mean_terminated_length": 782.9844970703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7613872462841618, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.4512675117178448, "kl": 0.096282958984375, "learning_rate": 2.476163572157694e-07, "loss": 0.1696, "num_tokens": 1962989352.0, "reward": 2.494419813156128, "reward_std": 0.5137013792991638, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 935.58935546875, "completions/mean_terminated_length": 708.3225708007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7616003409514677, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.140295697425878, "kl": 0.0291748046875, "learning_rate": 2.4736851854691075e-07, "loss": 0.1019, "num_tokens": 1963470848.0, "reward": 2.373326063156128, "reward_std": 0.4100281298160553, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17818261682987213, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 999.38623046875, "completions/mean_terminated_length": 788.5388793945312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.7618134356187737, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1343333206956619, "kl": 0.028717041015625, "learning_rate": 2.47120847361413e-07, "loss": 0.0761, "num_tokens": 1963983117.0, "reward": 2.4425225257873535, "reward_std": 0.44747480750083923, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14294590055942535, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1120.5379638671875, "completions/mean_terminated_length": 870.9376831054688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7620265302860796, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11180280074402434, "kl": 0.025604248046875, "learning_rate": 2.4687334379634257e-07, "loss": 0.0514, "num_tokens": 1964559534.0, "reward": 2.3487725257873535, "reward_std": 0.43206730484962463, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14102916419506073, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 998.3147583007812, "completions/mean_terminated_length": 800.628662109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7622396249533855, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15631073540325244, "kl": 0.0303955078125, "learning_rate": 2.466260079886738e-07, "loss": 0.0805, "num_tokens": 1965079067.0, "reward": 2.4765625, "reward_std": 0.4737948477268219, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.158765509724617, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1006.4620971679688, "completions/mean_terminated_length": 807.0186157226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7624527196206915, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12244953831029336, "kl": 0.0260009765625, "learning_rate": 2.4637884007528856e-07, "loss": 0.0899, "num_tokens": 1965601466.0, "reward": 2.4458706378936768, "reward_std": 0.3801722824573517, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13669590651988983, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 860.5402221679688, "completions/mean_terminated_length": 704.611083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7626658142879974, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.17167211218465026, "kl": 0.02978515625, "learning_rate": 2.46131840192975e-07, "loss": 0.111, "num_tokens": 1966055004.0, "reward": 2.556361675262451, "reward_std": 0.396171510219574, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.126290425658226, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 959.5558471679688, "completions/mean_terminated_length": 744.1951904296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7628789089553034, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1243076404594816, "kl": 0.028167724609375, "learning_rate": 2.4588500847842886e-07, "loss": 0.058, "num_tokens": 1966553221.0, "reward": 2.4888393878936768, "reward_std": 0.40750470757484436, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09289216995239258, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1021.3214721679688, "completions/mean_terminated_length": 821.4613037109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7630920036226093, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1290515701931414, "kl": 0.025146484375, "learning_rate": 2.4563834506825254e-07, "loss": 0.0882, "num_tokens": 1967075525.0, "reward": 2.3582589626312256, "reward_std": 0.36674249172210693, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14649668335914612, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1036.1629638671875, "completions/mean_terminated_length": 778.2437133789062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7633050982899153, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13772930751318233, "kl": 0.026123046875, "learning_rate": 2.4539185009895514e-07, "loss": 0.1087, "num_tokens": 1967611214.0, "reward": 2.3286831378936768, "reward_std": 0.4171675443649292, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16573180258274078, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 882.7433471679688, "completions/mean_terminated_length": 716.278076171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7635181929572212, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13364333942123158, "kl": 0.030364990234375, "learning_rate": 2.451455237069532e-07, "loss": 0.0478, "num_tokens": 1968072859.0, "reward": 2.447544813156128, "reward_std": 0.3564707338809967, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765028238297, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1016.4107666015625, "completions/mean_terminated_length": 831.810546875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7637312876245272, "frac_reward_zero_std": 0.0, "grad_norm": 0.14727939565231107, "kl": 0.030487060546875, "learning_rate": 2.44899366028569e-07, "loss": 0.0676, "num_tokens": 1968600467.0, "reward": 2.4575893878936768, "reward_std": 0.49572864174842834, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14033812284469604, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1025.0513916015625, "completions/mean_terminated_length": 799.2778930664062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7639443822918331, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13558707228149589, "kl": 0.026275634765625, "learning_rate": 2.4465337720003237e-07, "loss": 0.0827, "num_tokens": 1969129754.0, "reward": 2.46875, "reward_std": 0.3993665277957916, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 884.5178833007812, "completions/mean_terminated_length": 669.0581665039062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.764157476959139, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1400686037826665, "kl": 0.031524658203125, "learning_rate": 2.444075573574792e-07, "loss": 0.0527, "num_tokens": 1969594274.0, "reward": 2.47265625, "reward_std": 0.38611456751823425, "rewards/accuracy_reward/mean": 0.5856481194496155, "rewards/accuracy_reward/std": 0.49318093061447144, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10346972197294235, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 991.46435546875, "completions/mean_terminated_length": 747.6483764648438, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7643705716264451, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1535987161073911, "kl": 0.0294189453125, "learning_rate": 2.441619066369519e-07, "loss": 0.1006, "num_tokens": 1970109090.0, "reward": 2.428013563156128, "reward_std": 0.5237821340560913, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.18275512754917145, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1048.872802734375, "completions/mean_terminated_length": 828.35693359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.764583666293751, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11458527636540096, "kl": 0.0247802734375, "learning_rate": 2.4391642517439935e-07, "loss": 0.0715, "num_tokens": 1970649385.0, "reward": 2.4419643878936768, "reward_std": 0.4425899088382721, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15971019864082336, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1065.8951416015625, "completions/mean_terminated_length": 855.6341552734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.764796760961057, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.5952769315013996, "kl": 0.038543701171875, "learning_rate": 2.436711131056767e-07, "loss": 0.0472, "num_tokens": 1971197722.0, "reward": 2.3504464626312256, "reward_std": 0.44625774025917053, "rewards/accuracy_reward/mean": 0.49537035822868347, "rewards/accuracy_reward/std": 0.5005581974983215, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15035313367843628, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 901.419677734375, "completions/mean_terminated_length": 706.830322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7650098556283629, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14981113750271696, "kl": 0.029571533203125, "learning_rate": 2.4342597056654555e-07, "loss": 0.081, "num_tokens": 1971670630.0, "reward": 2.4637277126312256, "reward_std": 0.3871533274650574, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1044.46875, "completions/mean_terminated_length": 777.9943237304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7652229502956689, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11603017677169247, "kl": 0.02740478515625, "learning_rate": 2.431809976926735e-07, "loss": 0.1155, "num_tokens": 1972206088.0, "reward": 2.368861675262451, "reward_std": 0.40957650542259216, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.16255265474319458, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 919.0245971679688, "completions/mean_terminated_length": 730.8619995117188, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7654360449629748, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1290785461980715, "kl": 0.029083251953125, "learning_rate": 2.429361946196342e-07, "loss": 0.072, "num_tokens": 1972683955.0, "reward": 2.5518975257873535, "reward_std": 0.36550506949424744, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11358113586902618, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1059.1451416015625, "completions/mean_terminated_length": 837.5983276367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7656491396302808, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.113915854628643, "kl": 0.02484130859375, "learning_rate": 2.426915614829074e-07, "loss": 0.0248, "num_tokens": 1973233492.0, "reward": 2.486049175262451, "reward_std": 0.42285555601119995, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.08974618464708328, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 981.7188110351562, "completions/mean_terminated_length": 777.5372314453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7658622342975867, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.130512319149885, "kl": 0.0283203125, "learning_rate": 2.424470984178789e-07, "loss": 0.0732, "num_tokens": 1973743910.0, "reward": 2.3917412757873535, "reward_std": 0.4645915925502777, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16545002162456512, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1091.4866943359375, "completions/mean_terminated_length": 837.4971923828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7660753289648926, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.10218472436779619, "kl": 0.024078369140625, "learning_rate": 2.422028055598403e-07, "loss": 0.0378, "num_tokens": 1974308464.0, "reward": 2.2466518878936768, "reward_std": 0.3809371292591095, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.14405618607997894, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1110.138427734375, "completions/mean_terminated_length": 823.0379028320312, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7662884236321986, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13361386804718603, "kl": 0.02593994140625, "learning_rate": 2.4195868304398877e-07, "loss": 0.0707, "num_tokens": 1974876334.0, "reward": 2.26953125, "reward_std": 0.44167739152908325, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1571800261735916, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 910.7723388671875, "completions/mean_terminated_length": 751.6183471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7665015182995045, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13812753928517843, "kl": 0.029541015625, "learning_rate": 2.417147310054277e-07, "loss": 0.0488, "num_tokens": 1975349432.0, "reward": 2.4229912757873535, "reward_std": 0.37660279870033264, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.13854306936264038, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1127.03125, "completions/mean_terminated_length": 862.3850708007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7667146129668105, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10273798755348457, "kl": 0.023468017578125, "learning_rate": 2.414709495791659e-07, "loss": 0.0227, "num_tokens": 1975931974.0, "reward": 2.3130581378936768, "reward_std": 0.40218326449394226, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801212280988693, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1145.8326416015625, "completions/mean_terminated_length": 845.110107421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7669277076341164, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11078919554477011, "kl": 0.024444580078125, "learning_rate": 2.4122733890011764e-07, "loss": 0.0522, "num_tokens": 1976512587.0, "reward": 2.3169643878936768, "reward_std": 0.4617672860622406, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14140157401561737, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1048.5513916015625, "completions/mean_terminated_length": 775.9744262695312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7671408023014225, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1176535199287874, "kl": 0.026336669921875, "learning_rate": 2.4098389910310264e-07, "loss": 0.0666, "num_tokens": 1977049506.0, "reward": 2.407924175262451, "reward_std": 0.48348382115364075, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.19128604233264923, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 896.857177734375, "completions/mean_terminated_length": 780.8943481445312, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7673538969687284, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14208384937400445, "kl": 0.029144287109375, "learning_rate": 2.4074063032284627e-07, "loss": 0.0963, "num_tokens": 1977517250.0, "reward": 2.642857313156128, "reward_std": 0.39701440930366516, "rewards/accuracy_reward/mean": 0.7098214030265808, "rewards/accuracy_reward/std": 0.4543519914150238, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.216333270072937, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10152243822813034, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 978.1116333007812, "completions/mean_terminated_length": 756.0592651367188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7675669916360343, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11383303283371794, "kl": 0.02655029296875, "learning_rate": 2.404975326939795e-07, "loss": 0.0617, "num_tokens": 1978025748.0, "reward": 2.4324777126312256, "reward_std": 0.40557077527046204, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11660738289356232, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1047.107177734375, "completions/mean_terminated_length": 795.4860229492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7677800863033403, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13607792291359277, "kl": 0.02569580078125, "learning_rate": 2.402546063510377e-07, "loss": 0.0957, "num_tokens": 1978571652.0, "reward": 2.38671875, "reward_std": 0.46462905406951904, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11601705849170685, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 924.0692138671875, "completions/mean_terminated_length": 722.9447631835938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7679931809706462, "frac_reward_zero_std": 0.25, "grad_norm": 0.12167819209875655, "kl": 0.0303955078125, "learning_rate": 2.4001185142846244e-07, "loss": 0.0769, "num_tokens": 1979046323.0, "reward": 2.486049175262451, "reward_std": 0.34442827105522156, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10572549700737, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1018.9576416015625, "completions/mean_terminated_length": 795.2527465820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7682062756379522, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1346863233587877, "kl": 0.0264892578125, "learning_rate": 2.3976926806059983e-07, "loss": 0.0861, "num_tokens": 1979566000.0, "reward": 2.3722100257873535, "reward_std": 0.3897438943386078, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16768629848957062, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1065.859375, "completions/mean_terminated_length": 835.8815307617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7684193703052581, "frac_reward_zero_std": 0.0, "grad_norm": 0.1352124405627495, "kl": 0.027069091796875, "learning_rate": 2.3952685638170127e-07, "loss": 0.0869, "num_tokens": 1980110545.0, "reward": 2.439174175262451, "reward_std": 0.4569382965564728, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15448829531669617, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 998.40185546875, "completions/mean_terminated_length": 810.5789794921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7686324649725641, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1266860043322466, "kl": 0.027923583984375, "learning_rate": 2.392846165259229e-07, "loss": 0.0715, "num_tokens": 1980626133.0, "reward": 2.4760046005249023, "reward_std": 0.5225756168365479, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1590748131275177, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1048.65185546875, "completions/mean_terminated_length": 837.9783935546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.76884555963987, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12133733303648937, "kl": 0.023956298828125, "learning_rate": 2.39042548627326e-07, "loss": 0.0443, "num_tokens": 1981171705.0, "reward": 2.4854912757873535, "reward_std": 0.48748332262039185, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1251746416091919, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1031.493408203125, "completions/mean_terminated_length": 757.9291381835938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.769058654307176, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14746039766802882, "kl": 0.0281982421875, "learning_rate": 2.3880065281987694e-07, "loss": 0.0897, "num_tokens": 1981698294.0, "reward": 2.365513563156128, "reward_std": 0.4907568097114563, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17125900089740753, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1045.915283203125, "completions/mean_terminated_length": 797.4874267578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7692717489744819, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11947018970142272, "kl": 0.025970458984375, "learning_rate": 2.3855892923744596e-07, "loss": 0.085, "num_tokens": 1982237968.0, "reward": 2.2896206378936768, "reward_std": 0.40791308879852295, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1422542929649353, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 963.310302734375, "completions/mean_terminated_length": 792.3385009765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7694848436417878, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13723790537981842, "kl": 0.031707763671875, "learning_rate": 2.3831737801380902e-07, "loss": 0.0772, "num_tokens": 1982736651.0, "reward": 2.435826063156128, "reward_std": 0.4453217387199402, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1339094191789627, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1078.477783203125, "completions/mean_terminated_length": 844.825439453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7696979383090938, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13967204528406374, "kl": 0.026153564453125, "learning_rate": 2.3807599928264607e-07, "loss": 0.1093, "num_tokens": 1983292049.0, "reward": 2.3058037757873535, "reward_std": 0.47787559032440186, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.16626667976379395, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 884.4285888671875, "completions/mean_terminated_length": 704.4948120117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7699110329763997, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14579676265594846, "kl": 0.0313720703125, "learning_rate": 2.3783479317754173e-07, "loss": 0.0969, "num_tokens": 1983761553.0, "reward": 2.4291296005249023, "reward_std": 0.4548097848892212, "rewards/accuracy_reward/mean": 0.5486111044883728, "rewards/accuracy_reward/std": 0.49820831418037415, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1454136222600937, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1099.102783203125, "completions/mean_terminated_length": 840.3125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7701241276437057, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14073691009687478, "kl": 0.02545166015625, "learning_rate": 2.375937598319852e-07, "loss": 0.1034, "num_tokens": 1984326639.0, "reward": 2.3236608505249023, "reward_std": 0.44813287258148193, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14619384706020355, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 977.1183471679688, "completions/mean_terminated_length": 733.602783203125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7703372223110116, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1216279343940387, "kl": 0.02752685546875, "learning_rate": 2.373528993793698e-07, "loss": 0.0169, "num_tokens": 1984833972.0, "reward": 2.36328125, "reward_std": 0.41990092396736145, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11088934540748596, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1068.2076416015625, "completions/mean_terminated_length": 786.6580200195312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7705503169783177, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12228746702556395, "kl": 0.024993896484375, "learning_rate": 2.3711221195299366e-07, "loss": 0.0552, "num_tokens": 1985382305.0, "reward": 2.3588171005249023, "reward_std": 0.43226316571235657, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14748506247997284, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 953.044677734375, "completions/mean_terminated_length": 770.5521240234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7707634116456236, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1274991674342064, "kl": 0.027740478515625, "learning_rate": 2.368716976860588e-07, "loss": 0.0131, "num_tokens": 1985876341.0, "reward": 2.4542412757873535, "reward_std": 0.41259995102882385, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.1151164099574089, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1006.15185546875, "completions/mean_terminated_length": 800.0106811523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7709765063129295, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13531107781855503, "kl": 0.028533935546875, "learning_rate": 2.3663135671167106e-07, "loss": 0.0406, "num_tokens": 1986404473.0, "reward": 2.3738839626312256, "reward_std": 0.3625422418117523, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10404275357723236, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1004.8594360351562, "completions/mean_terminated_length": 774.62939453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7711896009802355, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12746021769799984, "kl": 0.028106689453125, "learning_rate": 2.3639118916284122e-07, "loss": 0.0557, "num_tokens": 1986918794.0, "reward": 2.4135046005249023, "reward_std": 0.48709607124328613, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1400342434644699, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1077.8148193359375, "completions/mean_terminated_length": 791.8063354492188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.7714026956475414, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11747572131021242, "kl": 0.027587890625, "learning_rate": 2.3615119517248344e-07, "loss": 0.0222, "num_tokens": 1987470599.0, "reward": 2.41015625, "reward_std": 0.4105912446975708, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661912620067596, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 869.8928833007812, "completions/mean_terminated_length": 718.549072265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7716157903148474, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13613917443582463, "kl": 0.030517578125, "learning_rate": 2.3591137487341613e-07, "loss": 0.1153, "num_tokens": 1987924887.0, "reward": 2.6434152126312256, "reward_std": 0.366009384393692, "rewards/accuracy_reward/mean": 0.7120535969734192, "rewards/accuracy_reward/std": 0.4533122181892395, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10854540765285492, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1080.6273193359375, "completions/mean_terminated_length": 854.107421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7718288849821533, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12550457893038833, "kl": 0.023773193359375, "learning_rate": 2.356717283983613e-07, "loss": 0.0763, "num_tokens": 1988480544.0, "reward": 2.2823662757873535, "reward_std": 0.43554142117500305, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1502326875925064, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1054.696533203125, "completions/mean_terminated_length": 873.8575439453125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7720419796494593, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11276633326191235, "kl": 0.025177001953125, "learning_rate": 2.354322558799449e-07, "loss": 0.0419, "num_tokens": 1989021128.0, "reward": 2.3794643878936768, "reward_std": 0.3929978013038635, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09478133171796799, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1020.169677734375, "completions/mean_terminated_length": 829.8306884765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7722550743167652, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11493500529344396, "kl": 0.02374267578125, "learning_rate": 2.351929574506969e-07, "loss": 0.0031, "num_tokens": 1989548884.0, "reward": 2.529017925262451, "reward_std": 0.3697143495082855, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1282729059457779, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 980.6406860351562, "completions/mean_terminated_length": 779.6259765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7724681689840712, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12700642223122208, "kl": 0.027374267578125, "learning_rate": 2.3495383324305058e-07, "loss": 0.0316, "num_tokens": 1990061571.0, "reward": 2.5390625, "reward_std": 0.3659279942512512, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389531940221786, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1062.805908203125, "completions/mean_terminated_length": 842.0792236328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7726812636513771, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12953766316555412, "kl": 0.024932861328125, "learning_rate": 2.347148833893428e-07, "loss": 0.0952, "num_tokens": 1990605596.0, "reward": 2.4112725257873535, "reward_std": 0.47081342339515686, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 937.529052734375, "completions/mean_terminated_length": 717.8101806640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.772894358318683, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4539489850730734, "kl": 0.031402587890625, "learning_rate": 2.344761080218139e-07, "loss": 0.0757, "num_tokens": 1991096457.0, "reward": 2.39453125, "reward_std": 0.37792688608169556, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12315750867128372, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 989.8192138671875, "completions/mean_terminated_length": 770.19677734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.773107452985989, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12234801992637155, "kl": 0.02783203125, "learning_rate": 2.3423750727260813e-07, "loss": 0.0676, "num_tokens": 1991604728.0, "reward": 2.541294813156128, "reward_std": 0.4165656864643097, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13351379334926605, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 941.7210083007812, "completions/mean_terminated_length": 757.3411865234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7733205476532949, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14182930023237067, "kl": 0.027496337890625, "learning_rate": 2.3399908127377246e-07, "loss": 0.0606, "num_tokens": 1992093083.0, "reward": 2.4402902126312256, "reward_std": 0.3861028850078583, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12956927716732025, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 945.2277221679688, "completions/mean_terminated_length": 751.3018188476562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.773533642320601, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13044873391044542, "kl": 0.029998779296875, "learning_rate": 2.3376083015725737e-07, "loss": 0.0592, "num_tokens": 1992585857.0, "reward": 2.3214287757873535, "reward_std": 0.40314850211143494, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16145165264606476, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1006.482177734375, "completions/mean_terminated_length": 786.9189453125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7737467369879069, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13866862165000746, "kl": 0.02685546875, "learning_rate": 2.3352275405491683e-07, "loss": 0.0803, "num_tokens": 1993103561.0, "reward": 2.53515625, "reward_std": 0.41071146726608276, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12979069352149963, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 888.0781860351562, "completions/mean_terminated_length": 748.8875122070312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7739598316552129, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1328342487872296, "kl": 0.032989501953125, "learning_rate": 2.3328485309850772e-07, "loss": 0.0585, "num_tokens": 1993567628.0, "reward": 2.5262277126312256, "reward_std": 0.39783385396003723, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1277548223733902, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1102.22998046875, "completions/mean_terminated_length": 819.8695678710938, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7741729263225188, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11539596183914869, "kl": 0.02471923828125, "learning_rate": 2.3304712741968992e-07, "loss": 0.0406, "num_tokens": 1994134547.0, "reward": 2.325892925262451, "reward_std": 0.3473705053329468, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09178353101015091, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 872.5960083007812, "completions/mean_terminated_length": 772.9855346679688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7743860209898247, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13167877442073989, "kl": 0.033233642578125, "learning_rate": 2.3280957715002638e-07, "loss": 0.0468, "num_tokens": 1994586606.0, "reward": 2.5965402126312256, "reward_std": 0.33661067485809326, "rewards/accuracy_reward/mean": 0.6651785969734192, "rewards/accuracy_reward/std": 0.47245556116104126, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10326440632343292, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 965.52685546875, "completions/mean_terminated_length": 781.8172607421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7745991156571307, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12956765625867853, "kl": 0.028228759765625, "learning_rate": 2.3257220242098294e-07, "loss": 0.0671, "num_tokens": 1995089018.0, "reward": 2.3247768878936768, "reward_std": 0.5176705121994019, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.1785159856081009, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 994.3750610351562, "completions/mean_terminated_length": 799.2592163085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7748122103244366, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12657641872165445, "kl": 0.0302734375, "learning_rate": 2.323350033639287e-07, "loss": 0.0978, "num_tokens": 1995606946.0, "reward": 2.5078125, "reward_std": 0.46482858061790466, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12805373966693878, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 987.357177734375, "completions/mean_terminated_length": 797.5579223632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7750253049917426, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13239562371121621, "kl": 0.02752685546875, "learning_rate": 2.3209798011013458e-07, "loss": 0.1171, "num_tokens": 1996119730.0, "reward": 2.453125, "reward_std": 0.4176524877548218, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14753690361976624, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 951.6607666015625, "completions/mean_terminated_length": 738.239990234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7752383996590485, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12646928223688847, "kl": 0.0289306640625, "learning_rate": 2.318611327907753e-07, "loss": 0.0511, "num_tokens": 1996612122.0, "reward": 2.4135046005249023, "reward_std": 0.3974364697933197, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12909629940986633, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 983.6652221679688, "completions/mean_terminated_length": 789.8944702148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7754514943263545, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15258903985359568, "kl": 0.027984619140625, "learning_rate": 2.316244615369276e-07, "loss": 0.0741, "num_tokens": 1997121876.0, "reward": 2.4268975257873535, "reward_std": 0.4359312951564789, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14680634438991547, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1171.825927734375, "completions/mean_terminated_length": 883.2344360351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7756645889936604, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1249174755472304, "kl": 0.021881103515625, "learning_rate": 2.313879664795709e-07, "loss": 0.076, "num_tokens": 1997727270.0, "reward": 2.333705425262451, "reward_std": 0.4284088909626007, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15247619152069092, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 881.9553833007812, "completions/mean_terminated_length": 662.3554077148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7758776836609664, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14867666267634752, "kl": 0.029296875, "learning_rate": 2.3115164774958702e-07, "loss": 0.1024, "num_tokens": 1998193442.0, "reward": 2.3761162757873535, "reward_std": 0.41546785831451416, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15155641734600067, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 989.0960083007812, "completions/mean_terminated_length": 789.6737060546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7760907783282723, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12126398754873464, "kl": 0.029205322265625, "learning_rate": 2.3091550547776023e-07, "loss": 0.0332, "num_tokens": 1998705789.0, "reward": 2.5396206378936768, "reward_std": 0.41325685381889343, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12130890786647797, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 879.3482666015625, "completions/mean_terminated_length": 739.1099853515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7763038729955782, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1348613642027373, "kl": 0.0330810546875, "learning_rate": 2.3067953979477747e-07, "loss": 0.0594, "num_tokens": 1999164969.0, "reward": 2.4955358505249023, "reward_std": 0.3621552586555481, "rewards/accuracy_reward/mean": 0.6157407164573669, "rewards/accuracy_reward/std": 0.48698362708091736, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 941.1607666015625, "completions/mean_terminated_length": 776.5538940429688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7765169676628842, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1290798824179389, "kl": 0.029510498046875, "learning_rate": 2.304437508312275e-07, "loss": 0.0633, "num_tokens": 1999654449.0, "reward": 2.4095983505249023, "reward_std": 0.41413557529449463, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1253739446401596, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1144.859375, "completions/mean_terminated_length": 871.8168334960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7767300623301902, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12082092667679627, "kl": 0.024749755859375, "learning_rate": 2.3020813871760157e-07, "loss": 0.0711, "num_tokens": 2000238882.0, "reward": 2.368861675262451, "reward_std": 0.4545716345310211, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1590748131275177, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 954.8058471679688, "completions/mean_terminated_length": 765.9293212890625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7769431569974962, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12355079622083395, "kl": 0.030517578125, "learning_rate": 2.2997270358429283e-07, "loss": 0.0449, "num_tokens": 2000738667.0, "reward": 2.5200893878936768, "reward_std": 0.4130350649356842, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10289046913385391, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 981.4152221679688, "completions/mean_terminated_length": 753.0677490234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7771562516648021, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1374693830646525, "kl": 0.028717041015625, "learning_rate": 2.2973744556159668e-07, "loss": 0.1158, "num_tokens": 2001240261.0, "reward": 2.462611675262451, "reward_std": 0.42233961820602417, "rewards/accuracy_reward/mean": 0.5995370149612427, "rewards/accuracy_reward/std": 0.4905603229999542, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.141964390873909, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 971.4085083007812, "completions/mean_terminated_length": 754.9356689453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7773693463321081, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14054032119794602, "kl": 0.028350830078125, "learning_rate": 2.2950236477971035e-07, "loss": 0.0882, "num_tokens": 2001748428.0, "reward": 2.4654018878936768, "reward_std": 0.42282602190971375, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14929908514022827, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 959.7277221679688, "completions/mean_terminated_length": 758.1957397460938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.777582440999414, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12800211027742053, "kl": 0.029632568359375, "learning_rate": 2.2926746136873305e-07, "loss": 0.0499, "num_tokens": 2002244354.0, "reward": 2.4916296005249023, "reward_std": 0.3784066140651703, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12292414903640747, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 966.4553833007812, "completions/mean_terminated_length": 779.5916137695312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.77779553566672, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11776831389861644, "kl": 0.02777099609375, "learning_rate": 2.29032735458666e-07, "loss": 0.0313, "num_tokens": 2002743198.0, "reward": 2.486049175262451, "reward_std": 0.3369090259075165, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11214316636323929, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1019.825927734375, "completions/mean_terminated_length": 806.4312744140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7780086303340259, "frac_reward_zero_std": 0.25, "grad_norm": 0.2513765332487112, "kl": 0.039306640625, "learning_rate": 2.2879818717941193e-07, "loss": 0.0801, "num_tokens": 2003268400.0, "reward": 2.4268975257873535, "reward_std": 0.3885403573513031, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1273927539587021, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1058.8348388671875, "completions/mean_terminated_length": 863.11767578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7782217250013318, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13243359433625523, "kl": 0.026763916015625, "learning_rate": 2.2856381666077533e-07, "loss": 0.0771, "num_tokens": 2003810902.0, "reward": 2.5066964626312256, "reward_std": 0.4575176537036896, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13422717154026031, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 987.4397583007812, "completions/mean_terminated_length": 804.2015991210938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7784348196686378, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14586084636403504, "kl": 0.02801513671875, "learning_rate": 2.283296240324624e-07, "loss": 0.0757, "num_tokens": 2004323131.0, "reward": 2.4308037757873535, "reward_std": 0.42310458421707153, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1480942964553833, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1115.607177734375, "completions/mean_terminated_length": 837.2406005859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7786479143359437, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10713449031580556, "kl": 0.02374267578125, "learning_rate": 2.2809560942408064e-07, "loss": 0.1111, "num_tokens": 2004895835.0, "reward": 2.3677456378936768, "reward_std": 0.39975327253341675, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11660738289356232, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1134.997802734375, "completions/mean_terminated_length": 862.4203491210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7788610090032497, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1320323343113309, "kl": 0.025543212890625, "learning_rate": 2.2786177296513967e-07, "loss": 0.0904, "num_tokens": 2005473098.0, "reward": 2.365513563156128, "reward_std": 0.4179643988609314, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11898136883974075, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 975.55810546875, "completions/mean_terminated_length": 766.789306640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7790741036705556, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1280844386746548, "kl": 0.028228759765625, "learning_rate": 2.276281147850495e-07, "loss": 0.0305, "num_tokens": 2005974356.0, "reward": 2.4776787757873535, "reward_std": 0.35402682423591614, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09137456864118576, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 974.7120971679688, "completions/mean_terminated_length": 758.9035034179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7792871983378616, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14623424390506726, "kl": 0.030487060546875, "learning_rate": 2.2739463501312245e-07, "loss": 0.0807, "num_tokens": 2006482035.0, "reward": 2.5479912757873535, "reward_std": 0.4567717909812927, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13455694913864136, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 897.5982666015625, "completions/mean_terminated_length": 655.0811157226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7795002930051675, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14366237493308276, "kl": 0.031890869140625, "learning_rate": 2.271613337785716e-07, "loss": 0.0686, "num_tokens": 2006948911.0, "reward": 2.5206475257873535, "reward_std": 0.375855028629303, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517839670181274, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1167.1451416015625, "completions/mean_terminated_length": 883.9203491210938, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7797133876724734, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12655777725008752, "kl": 0.0240478515625, "learning_rate": 2.2692821121051133e-07, "loss": 0.0763, "num_tokens": 2007538256.0, "reward": 2.3878350257873535, "reward_std": 0.46959805488586426, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16374634206295013, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1101.3773193359375, "completions/mean_terminated_length": 839.77490234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7799264823397795, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12352040421287824, "kl": 0.026763916015625, "learning_rate": 2.266952674379571e-07, "loss": 0.0815, "num_tokens": 2008098857.0, "reward": 2.3722100257873535, "reward_std": 0.4799840748310089, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14461299777030945, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1060.430908203125, "completions/mean_terminated_length": 812.1591796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7801395770070854, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1641267105861403, "kl": 0.030364990234375, "learning_rate": 2.2646250258982536e-07, "loss": 0.0677, "num_tokens": 2008650250.0, "reward": 2.361607313156128, "reward_std": 0.49516233801841736, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15883232653141022, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1031.640625, "completions/mean_terminated_length": 820.6981201171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7803526716743914, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13777478562514822, "kl": 0.02581787109375, "learning_rate": 2.2622991679493388e-07, "loss": 0.0633, "num_tokens": 2009180633.0, "reward": 2.3203125, "reward_std": 0.4260157346725464, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1569942682981491, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 872.997802734375, "completions/mean_terminated_length": 731.9974975585938, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.7805657663416973, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12872747447833294, "kl": 0.031158447265625, "learning_rate": 2.2599751018200094e-07, "loss": 0.0115, "num_tokens": 2009638200.0, "reward": 2.5262277126312256, "reward_std": 0.3268493413925171, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.13086353242397308, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1036.4107666015625, "completions/mean_terminated_length": 789.1333618164062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7807788610090033, "frac_reward_zero_std": 0.0, "grad_norm": 0.1448101448321292, "kl": 0.028839111328125, "learning_rate": 2.257652828796459e-07, "loss": 0.0808, "num_tokens": 2010177504.0, "reward": 2.3253350257873535, "reward_std": 0.48469600081443787, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.4988385736942291, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1508246213197708, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1051.0223388671875, "completions/mean_terminated_length": 827.6557006835938, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7809919556763092, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11972620554431765, "kl": 0.02435302734375, "learning_rate": 2.2553323501638865e-07, "loss": 0.0501, "num_tokens": 2010724442.0, "reward": 2.3950893878936768, "reward_std": 0.41521820425987244, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14626215398311615, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1126.1004638671875, "completions/mean_terminated_length": 861.186767578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7812050503436152, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12409003575996988, "kl": 0.024627685546875, "learning_rate": 2.2530136672065004e-07, "loss": 0.0778, "num_tokens": 2011294039.0, "reward": 2.349888563156128, "reward_std": 0.4730124771595001, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14515583217144012, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1002.33935546875, "completions/mean_terminated_length": 739.4636840820312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7814181450109211, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12166091373410194, "kl": 0.027130126953125, "learning_rate": 2.2506967812075142e-07, "loss": 0.0724, "num_tokens": 2011811135.0, "reward": 2.453125, "reward_std": 0.4462078809738159, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17205262184143066, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1043.0648193359375, "completions/mean_terminated_length": 797.4138793945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.781631239678227, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11743546623208966, "kl": 0.0279541015625, "learning_rate": 2.2483816934491449e-07, "loss": 0.0568, "num_tokens": 2012343676.0, "reward": 2.407924175262451, "reward_std": 0.4028477072715759, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13776934146881104, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1113.84375, "completions/mean_terminated_length": 862.44189453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.781844334345533, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11042146865188092, "kl": 0.024444580078125, "learning_rate": 2.2460684052126195e-07, "loss": 0.0655, "num_tokens": 2012921238.0, "reward": 2.3565850257873535, "reward_std": 0.4532468914985657, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1422542929649353, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1136.0960693359375, "completions/mean_terminated_length": 884.0883178710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7820574290128389, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11780196591125137, "kl": 0.0245361328125, "learning_rate": 2.2437569177781663e-07, "loss": 0.0575, "num_tokens": 2013504385.0, "reward": 2.3046875, "reward_std": 0.3964941203594208, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.4935782551765442, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14164415001869202, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1031.7232666015625, "completions/mean_terminated_length": 765.4873046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7822705236801449, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1974781473347365, "kl": 0.043060302734375, "learning_rate": 2.2414472324250117e-07, "loss": 0.0773, "num_tokens": 2014035461.0, "reward": 2.3470983505249023, "reward_std": 0.4678664803504944, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.1816219538450241, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1020.87060546875, "completions/mean_terminated_length": 748.1299438476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7824836183474508, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12375618884014736, "kl": 0.02838134765625, "learning_rate": 2.239139350431395e-07, "loss": 0.0459, "num_tokens": 2014569435.0, "reward": 2.353794813156128, "reward_std": 0.35637223720550537, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10537806153297424, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1010.3951416015625, "completions/mean_terminated_length": 784.8287963867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7826967130147569, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12761523859488438, "kl": 0.027923583984375, "learning_rate": 2.2368332730745483e-07, "loss": 0.082, "num_tokens": 2015089836.0, "reward": 2.458705425262451, "reward_std": 0.40176674723625183, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 920.794677734375, "completions/mean_terminated_length": 782.3659057617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7829098076820628, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13777751915720313, "kl": 0.0279541015625, "learning_rate": 2.2345290016307138e-07, "loss": 0.0507, "num_tokens": 2015572720.0, "reward": 2.482142925262451, "reward_std": 0.39352041482925415, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 954.3192138671875, "completions/mean_terminated_length": 734.4102172851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7831229023493687, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13990265677542843, "kl": 0.029571533203125, "learning_rate": 2.2322265373751238e-07, "loss": 0.1061, "num_tokens": 2016064287.0, "reward": 2.3995537757873535, "reward_std": 0.48027709126472473, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15883232653141022, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1058.0067138671875, "completions/mean_terminated_length": 832.8849487304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7833359970166747, "frac_reward_zero_std": 0.0, "grad_norm": 0.13810124635676385, "kl": 0.02728271484375, "learning_rate": 2.22992588158202e-07, "loss": 0.0608, "num_tokens": 2016604258.0, "reward": 2.239955425262451, "reward_std": 0.4449780583381653, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1591111123561859, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1066.203125, "completions/mean_terminated_length": 809.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7835490916839806, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1219721677944319, "kl": 0.025390625, "learning_rate": 2.2276270355246374e-07, "loss": 0.0726, "num_tokens": 2017158909.0, "reward": 2.37109375, "reward_std": 0.4210568964481354, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15941192209720612, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1013.9910888671875, "completions/mean_terminated_length": 812.7039794921875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7837621863512866, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11583766678287374, "kl": 0.026336669921875, "learning_rate": 2.2253300004752125e-07, "loss": 0.0695, "num_tokens": 2017682521.0, "reward": 2.4369421005249023, "reward_std": 0.4287792146205902, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12540756165981293, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 981.5870971679688, "completions/mean_terminated_length": 800.6031494140625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7839752810185925, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1418503990547118, "kl": 0.02716064453125, "learning_rate": 2.2230347777049768e-07, "loss": 0.087, "num_tokens": 2018192496.0, "reward": 2.3833706378936768, "reward_std": 0.5248706936836243, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15765586495399475, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 962.0313110351562, "completions/mean_terminated_length": 777.7284545898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7841883756858985, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1333875099276704, "kl": 0.0286865234375, "learning_rate": 2.2207413684841607e-07, "loss": 0.0776, "num_tokens": 2018691390.0, "reward": 2.30078125, "reward_std": 0.4338666498661041, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 902.8214721679688, "completions/mean_terminated_length": 762.1854858398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7844014703532044, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14520749362410013, "kl": 0.030120849609375, "learning_rate": 2.218449774081994e-07, "loss": 0.084, "num_tokens": 2019160462.0, "reward": 2.549107313156128, "reward_std": 0.5027464032173157, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15456202626228333, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 899.82373046875, "completions/mean_terminated_length": 683.5888671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7846145650205104, "frac_reward_zero_std": 0.25, "grad_norm": 0.12764753318983862, "kl": 0.028656005859375, "learning_rate": 2.2161599957666944e-07, "loss": 0.0437, "num_tokens": 2019631167.0, "reward": 2.4246652126312256, "reward_std": 0.3342822790145874, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10500273108482361, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1065.279052734375, "completions/mean_terminated_length": 835.165283203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7848276596878163, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1135853342225109, "kl": 0.025726318359375, "learning_rate": 2.2138720348054796e-07, "loss": 0.0671, "num_tokens": 2020176108.0, "reward": 2.263951063156128, "reward_std": 0.4311540722846985, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14869897067546844, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 994.904052734375, "completions/mean_terminated_length": 786.5374755859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7850407543551222, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12697064415422812, "kl": 0.026123046875, "learning_rate": 2.2115858924645635e-07, "loss": 0.0537, "num_tokens": 2020703201.0, "reward": 2.5011162757873535, "reward_std": 0.39964696764945984, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10269007831811905, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 991.7745971679688, "completions/mean_terminated_length": 751.591796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7852538490224282, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13340527552487247, "kl": 0.029144287109375, "learning_rate": 2.209301570009149e-07, "loss": 0.0945, "num_tokens": 2021212524.0, "reward": 2.48046875, "reward_std": 0.4135218560695648, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12909629940986633, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 870.6741333007812, "completions/mean_terminated_length": 729.3949584960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7854669436897341, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12423435706545113, "kl": 0.0306396484375, "learning_rate": 2.2070190687034334e-07, "loss": 0.0459, "num_tokens": 2021675066.0, "reward": 2.6099331378936768, "reward_std": 0.34556421637535095, "rewards/accuracy_reward/mean": 0.6785714030265808, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10724954307079315, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 944.9442138671875, "completions/mean_terminated_length": 777.6427001953125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.7856800383570401, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13382084548346312, "kl": 0.02935791015625, "learning_rate": 2.2047383898106065e-07, "loss": 0.0479, "num_tokens": 2022164705.0, "reward": 2.4888393878936768, "reward_std": 0.4068002998828888, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855977296829224, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09624522179365158, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1041.22998046875, "completions/mean_terminated_length": 812.2931518554688, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.785893133024346, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1119449561398047, "kl": 0.027008056640625, "learning_rate": 2.202459534592851e-07, "loss": 0.0281, "num_tokens": 2022703384.0, "reward": 2.4213171005249023, "reward_std": 0.4229731857776642, "rewards/accuracy_reward/mean": 0.5208333134651184, "rewards/accuracy_reward/std": 0.5001450181007385, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10481233894824982, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 967.591552734375, "completions/mean_terminated_length": 757.27197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7861062276916521, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1470578422340937, "kl": 0.033721923828125, "learning_rate": 2.2001825043113393e-07, "loss": 0.0995, "num_tokens": 2023204177.0, "reward": 2.4129464626312256, "reward_std": 0.42100703716278076, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13318143784999847, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1068.7366943359375, "completions/mean_terminated_length": 787.3390502929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.786319322358958, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1196608706978701, "kl": 0.024627685546875, "learning_rate": 2.1979073002262294e-07, "loss": 0.0837, "num_tokens": 2023755707.0, "reward": 2.4486608505249023, "reward_std": 0.3850461542606354, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.19128035008907318, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09730303287506104, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 958.3035888671875, "completions/mean_terminated_length": 773.3681640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7865324170262639, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12130150411124722, "kl": 0.02642822265625, "learning_rate": 2.1956339235966764e-07, "loss": 0.073, "num_tokens": 2024252691.0, "reward": 2.5005581378936768, "reward_std": 0.32296568155288696, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12292414158582687, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 916.0469360351562, "completions/mean_terminated_length": 751.0307006835938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7867455116935699, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11505428708261616, "kl": 0.03076171875, "learning_rate": 2.1933623756808193e-07, "loss": 0.0301, "num_tokens": 2024729992.0, "reward": 2.4464287757873535, "reward_std": 0.3755577802658081, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08382127434015274, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 903.872802734375, "completions/mean_terminated_length": 695.5752563476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7869586063608758, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13296898931208118, "kl": 0.033599853515625, "learning_rate": 2.1910926577357858e-07, "loss": 0.0373, "num_tokens": 2025204735.0, "reward": 2.322544813156128, "reward_std": 0.42915159463882446, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1292569637298584, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1077.5223388671875, "completions/mean_terminated_length": 826.7247314453125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7871717010281818, "frac_reward_zero_std": 0.0, "grad_norm": 0.14054347357368147, "kl": 0.02691650390625, "learning_rate": 2.1888247710176905e-07, "loss": 0.0776, "num_tokens": 2025761785.0, "reward": 2.3699777126312256, "reward_std": 0.5200629234313965, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15792487561702728, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 883.841552734375, "completions/mean_terminated_length": 753.8486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7873847956954877, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13327169568288547, "kl": 0.029388427734375, "learning_rate": 2.1865587167816346e-07, "loss": 0.0492, "num_tokens": 2026231090.0, "reward": 2.333705425262451, "reward_std": 0.3993498980998993, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.8816964030265808, "rewards/format_reward/std": 0.32332828640937805, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.15497742593288422, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 957.79248046875, "completions/mean_terminated_length": 795.6589965820312, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7875978903627937, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13060439677947236, "kl": 0.027679443359375, "learning_rate": 2.1842944962817077e-07, "loss": 0.0829, "num_tokens": 2026728981.0, "reward": 2.5736608505249023, "reward_std": 0.43459370732307434, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.46923142671585083, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1086.435302734375, "completions/mean_terminated_length": 851.3861083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7878109850300996, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11605514449486162, "kl": 0.0260009765625, "learning_rate": 2.1820321107709817e-07, "loss": 0.0571, "num_tokens": 2027290696.0, "reward": 2.3130581378936768, "reward_std": 0.43512123823165894, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517839670181274, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 896.325927734375, "completions/mean_terminated_length": 686.6543579101562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7880240796974056, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14635062310934602, "kl": 0.031280517578125, "learning_rate": 2.179771561501514e-07, "loss": 0.0616, "num_tokens": 2027767098.0, "reward": 2.490513563156128, "reward_std": 0.42108434438705444, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13153910636901855, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 923.1785888671875, "completions/mean_terminated_length": 742.5077514648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7882371743647115, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12043100363810931, "kl": 0.029083251953125, "learning_rate": 2.1775128497243445e-07, "loss": 0.0485, "num_tokens": 2028247722.0, "reward": 2.4720983505249023, "reward_std": 0.4003392457962036, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1034.546875, "completions/mean_terminated_length": 797.2369384765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7884502690320174, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1310006120665886, "kl": 0.02679443359375, "learning_rate": 2.1752559766894978e-07, "loss": 0.0751, "num_tokens": 2028784079.0, "reward": 2.310267925262451, "reward_std": 0.4275436997413635, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1150.6629638671875, "completions/mean_terminated_length": 795.6417236328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7886633636993234, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13043937884250986, "kl": 0.023773193359375, "learning_rate": 2.1730009436459812e-07, "loss": 0.1232, "num_tokens": 2029366968.0, "reward": 2.2566964626312256, "reward_std": 0.5472320914268494, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.18130891025066376, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1035.357177734375, "completions/mean_terminated_length": 847.8306884765625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7888764583666293, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11786473367710062, "kl": 0.0260009765625, "learning_rate": 2.1707477518417806e-07, "loss": 0.0888, "num_tokens": 2029901864.0, "reward": 2.4497768878936768, "reward_std": 0.37727296352386475, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11015089601278305, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1061.2210693359375, "completions/mean_terminated_length": 887.6929321289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7890895530339354, "frac_reward_zero_std": 0.0, "grad_norm": 0.13375056899691412, "kl": 0.025421142578125, "learning_rate": 2.1684964025238684e-07, "loss": 0.0562, "num_tokens": 2030447803.0, "reward": 2.478236675262451, "reward_std": 0.42049261927604675, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12338031083345413, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 971.33935546875, "completions/mean_terminated_length": 791.8958740234375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7893026477012413, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12858110366780875, "kl": 0.02825927734375, "learning_rate": 2.166246896938192e-07, "loss": 0.0667, "num_tokens": 2030950275.0, "reward": 2.450892925262451, "reward_std": 0.49948036670684814, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 936.3348388671875, "completions/mean_terminated_length": 737.4052734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7895157423685473, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14633276212892932, "kl": 0.033294677734375, "learning_rate": 2.163999236329681e-07, "loss": 0.088, "num_tokens": 2031433977.0, "reward": 2.5396206378936768, "reward_std": 0.48697176575660706, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15557540953159332, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1003.810302734375, "completions/mean_terminated_length": 803.8590087890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7897288370358532, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12174580562225036, "kl": 0.025146484375, "learning_rate": 2.1617534219422445e-07, "loss": 0.0556, "num_tokens": 2031958724.0, "reward": 2.3900671005249023, "reward_std": 0.35458117723464966, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11214316636323929, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1006.4576416015625, "completions/mean_terminated_length": 790.2883911132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7899419317031592, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12339277743912677, "kl": 0.027801513671875, "learning_rate": 2.159509455018766e-07, "loss": 0.0387, "num_tokens": 2032481793.0, "reward": 2.4765625, "reward_std": 0.48525917530059814, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12585100531578064, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 985.9397583007812, "completions/mean_terminated_length": 789.2619018554688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7901550263704651, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.2479972637988037, "kl": 0.02691650390625, "learning_rate": 2.1572673368011141e-07, "loss": 0.0374, "num_tokens": 2032989526.0, "reward": 2.490513563156128, "reward_std": 0.41473469138145447, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11660738289356232, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1100.2410888671875, "completions/mean_terminated_length": 918.7553100585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.790368121037771, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11057306656234722, "kl": 0.02630615234375, "learning_rate": 2.1550270685301248e-07, "loss": 0.034, "num_tokens": 2033552466.0, "reward": 2.4151787757873535, "reward_std": 0.403978556394577, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11849904805421829, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 915.1897583007812, "completions/mean_terminated_length": 743.3753051757812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.790581215705077, "frac_reward_zero_std": 0.0, "grad_norm": 0.14854224376765482, "kl": 0.0350341796875, "learning_rate": 2.1527886514456178e-07, "loss": 0.0571, "num_tokens": 2034026679.0, "reward": 2.5558037757873535, "reward_std": 0.4550960063934326, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 949.05810546875, "completions/mean_terminated_length": 762.5535278320312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7907943103723829, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1263606279364349, "kl": 0.028778076171875, "learning_rate": 2.150552086786385e-07, "loss": 0.0869, "num_tokens": 2034515489.0, "reward": 2.4799108505249023, "reward_std": 0.441692054271698, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14238695800304413, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1065.9710693359375, "completions/mean_terminated_length": 836.019287109375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7910074050396889, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12221103511043187, "kl": 0.024169921875, "learning_rate": 2.1483173757901947e-07, "loss": 0.049, "num_tokens": 2035062452.0, "reward": 2.38671875, "reward_std": 0.3715497851371765, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.09848741441965103, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1068.01123046875, "completions/mean_terminated_length": 838.5372314453125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7912204997069948, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12515750330428307, "kl": 0.024932861328125, "learning_rate": 2.146084519693787e-07, "loss": 0.0741, "num_tokens": 2035608425.0, "reward": 2.486049175262451, "reward_std": 0.44873178005218506, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12086557596921921, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 952.372802734375, "completions/mean_terminated_length": 779.677001953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7914335943743008, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1408033120560987, "kl": 0.02996826171875, "learning_rate": 2.1438535197328755e-07, "loss": 0.0732, "num_tokens": 2036106720.0, "reward": 2.4994421005249023, "reward_std": 0.4345710873603821, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12992529571056366, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 938.497802734375, "completions/mean_terminated_length": 760.2875366210938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7916466890416067, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371792415767387, "kl": 0.03082275390625, "learning_rate": 2.1416243771421517e-07, "loss": 0.1118, "num_tokens": 2036589951.0, "reward": 2.5691964626312256, "reward_std": 0.47533902525901794, "rewards/accuracy_reward/mean": 0.7008928656578064, "rewards/accuracy_reward/std": 0.45837873220443726, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1671055108308792, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 985.5313110351562, "completions/mean_terminated_length": 754.559814453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7918597837089126, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12873672556769974, "kl": 0.02862548828125, "learning_rate": 2.139397093155273e-07, "loss": 0.0762, "num_tokens": 2037100029.0, "reward": 2.3582589626312256, "reward_std": 0.411371111869812, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12914101779460907, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1050.82373046875, "completions/mean_terminated_length": 856.7066650390625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7920728783762186, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1333956924941047, "kl": 0.024993896484375, "learning_rate": 2.137171669004871e-07, "loss": 0.1113, "num_tokens": 2037642190.0, "reward": 2.42578125, "reward_std": 0.5111509561538696, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15880775451660156, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1001.2232666015625, "completions/mean_terminated_length": 813.9052734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.7922859730435245, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11934848126811798, "kl": 0.026397705078125, "learning_rate": 2.1349481059225477e-07, "loss": 0.0924, "num_tokens": 2038160066.0, "reward": 2.5753350257873535, "reward_std": 0.4170945882797241, "rewards/accuracy_reward/mean": 0.6944444179534912, "rewards/accuracy_reward/std": 0.46117642521858215, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15448831021785736, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 943.8951416015625, "completions/mean_terminated_length": 759.8776245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7924990677108306, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11582637769836876, "kl": 0.02703857421875, "learning_rate": 2.1327264051388755e-07, "loss": 0.0414, "num_tokens": 2038651987.0, "reward": 2.4972100257873535, "reward_std": 0.3424782156944275, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.1125321164727211, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 938.4531860351562, "completions/mean_terminated_length": 786.3832397460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7927121623781365, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14429382426113838, "kl": 0.0302734375, "learning_rate": 2.1305065678833948e-07, "loss": 0.1026, "num_tokens": 2039144254.0, "reward": 2.498326063156128, "reward_std": 0.519925057888031, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.17201544344425201, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 950.4910888671875, "completions/mean_terminated_length": 729.8123779296875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7929252570454425, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1485087061901842, "kl": 0.029815673828125, "learning_rate": 2.1282885953846146e-07, "loss": 0.0699, "num_tokens": 2039639386.0, "reward": 2.5011162757873535, "reward_std": 0.4704189598560333, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13033421337604523, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1150.671875, "completions/mean_terminated_length": 886.1416015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7931383517127484, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11780789927320942, "kl": 0.024871826171875, "learning_rate": 2.1260724888700154e-07, "loss": 0.0422, "num_tokens": 2040238471.0, "reward": 2.318080425262451, "reward_std": 0.43737876415252686, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1569942682981491, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 932.40185546875, "completions/mean_terminated_length": 779.5025024414062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7933514463800544, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13564200236173438, "kl": 0.0302734375, "learning_rate": 2.1238582495660437e-07, "loss": 0.0868, "num_tokens": 2040719307.0, "reward": 2.505580425262451, "reward_std": 0.44408172369003296, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1100.0625, "completions/mean_terminated_length": 868.344482421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7935645410473603, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11664149734504703, "kl": 0.025634765625, "learning_rate": 2.1216458786981057e-07, "loss": 0.0429, "num_tokens": 2041283687.0, "reward": 2.415736675262451, "reward_std": 0.42259514331817627, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12450840324163437, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1074.841552734375, "completions/mean_terminated_length": 761.9380493164062, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.7937776357146662, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1391687292354972, "kl": 0.027191162109375, "learning_rate": 2.119435377490585e-07, "loss": 0.0895, "num_tokens": 2041836048.0, "reward": 2.338169813156128, "reward_std": 0.4713340103626251, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17736539244651794, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 927.2879638671875, "completions/mean_terminated_length": 786.4949951171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.7939907303819722, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12375926531322826, "kl": 0.028839111328125, "learning_rate": 2.117226747166821e-07, "loss": 0.0934, "num_tokens": 2042312721.0, "reward": 2.6339287757873535, "reward_std": 0.374872624874115, "rewards/accuracy_reward/mean": 0.7165178656578064, "rewards/accuracy_reward/std": 0.4511922299861908, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11731318384408951, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 955.4866333007812, "completions/mean_terminated_length": 773.4010620117188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7942038250492781, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12880166313406163, "kl": 0.027557373046875, "learning_rate": 2.115019988949126e-07, "loss": 0.0892, "num_tokens": 2042814059.0, "reward": 2.4737725257873535, "reward_std": 0.48032593727111816, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1400342434644699, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1129.1429443359375, "completions/mean_terminated_length": 851.3488159179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7944169197165841, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1245878336278141, "kl": 0.02484130859375, "learning_rate": 2.1128151040587656e-07, "loss": 0.0961, "num_tokens": 2043390811.0, "reward": 2.2818081378936768, "reward_std": 0.4619499146938324, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14585080742835999, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1053.5379638671875, "completions/mean_terminated_length": 785.906494140625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.79463001438389, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12094039107319492, "kl": 0.02728271484375, "learning_rate": 2.1106120937159802e-07, "loss": 0.0882, "num_tokens": 2043930764.0, "reward": 2.4034600257873535, "reward_std": 0.41054195165634155, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11921197921037674, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1106.4375, "completions/mean_terminated_length": 885.96142578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.794843109051196, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1215029299627093, "kl": 0.0257568359375, "learning_rate": 2.1084109591399657e-07, "loss": 0.0593, "num_tokens": 2044493664.0, "reward": 2.4676339626312256, "reward_std": 0.5396391749382019, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822286784648895, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15520283579826355, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1151.60498046875, "completions/mean_terminated_length": 890.6945190429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7950562037185019, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11133902821550544, "kl": 0.0233154296875, "learning_rate": 2.1062117015488807e-07, "loss": 0.0571, "num_tokens": 2045081695.0, "reward": 2.345982313156128, "reward_std": 0.4985843002796173, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1540280133485794, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1099.779052734375, "completions/mean_terminated_length": 841.17333984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7952692983858078, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12075919170782645, "kl": 0.0235595703125, "learning_rate": 2.104014322159847e-07, "loss": 0.082, "num_tokens": 2045641052.0, "reward": 2.2840402126312256, "reward_std": 0.426430881023407, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15371057391166687, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1097.732177734375, "completions/mean_terminated_length": 852.1572875976562, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7954823930531139, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12949899826555375, "kl": 0.024505615234375, "learning_rate": 2.1018188221889437e-07, "loss": 0.0997, "num_tokens": 2046207460.0, "reward": 2.4609375, "reward_std": 0.4859558641910553, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15964375436306, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 974.10498046875, "completions/mean_terminated_length": 761.6229858398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7956954877204198, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15092354572097386, "kl": 0.028076171875, "learning_rate": 2.0996252028512163e-07, "loss": 0.1397, "num_tokens": 2046712707.0, "reward": 2.4447546005249023, "reward_std": 0.47263821959495544, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 919.8638916015625, "completions/mean_terminated_length": 752.8590087890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7959085823877258, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1488288799189458, "kl": 0.0284423828125, "learning_rate": 2.0974334653606617e-07, "loss": 0.0818, "num_tokens": 2047199990.0, "reward": 2.4893975257873535, "reward_std": 0.4314475953578949, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11517468839883804, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 937.6607666015625, "completions/mean_terminated_length": 752.6041870117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7961216770550317, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13175590074076265, "kl": 0.029266357421875, "learning_rate": 2.095243610930238e-07, "loss": 0.1074, "num_tokens": 2047690414.0, "reward": 2.4291296005249023, "reward_std": 0.42258235812187195, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13850137591362, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 903.51123046875, "completions/mean_terminated_length": 723.1137084960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7963347717223377, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13573423016954303, "kl": 0.0311279296875, "learning_rate": 2.0930556407718658e-07, "loss": 0.0614, "num_tokens": 2048154563.0, "reward": 2.3934152126312256, "reward_std": 0.4178394079208374, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16459043323993683, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 939.9576416015625, "completions/mean_terminated_length": 758.6415405273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7965478663896436, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13468923267631736, "kl": 0.030731201171875, "learning_rate": 2.090869556096417e-07, "loss": 0.0627, "num_tokens": 2048641936.0, "reward": 2.4285714626312256, "reward_std": 0.419168621301651, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10152244567871094, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 923.0469360351562, "completions/mean_terminated_length": 762.3392944335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7967609610569496, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.18708771396666835, "kl": 0.03289794921875, "learning_rate": 2.0886853581137214e-07, "loss": 0.071, "num_tokens": 2049122517.0, "reward": 2.388392925262451, "reward_std": 0.4342552125453949, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14040927588939667, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1201.9576416015625, "completions/mean_terminated_length": 902.9033203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7969740557242555, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12833860171497077, "kl": 0.02337646484375, "learning_rate": 2.0865030480325653e-07, "loss": 0.08, "num_tokens": 2049737938.0, "reward": 2.244419813156128, "reward_std": 0.49510660767555237, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17233900725841522, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1019.1160888671875, "completions/mean_terminated_length": 760.4580688476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7971871503915614, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11926554649229412, "kl": 0.026336669921875, "learning_rate": 2.084322627060693e-07, "loss": 0.0597, "num_tokens": 2050260918.0, "reward": 2.361607313156128, "reward_std": 0.37909311056137085, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10947446525096893, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 997.5000610351562, "completions/mean_terminated_length": 714.7875366210938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7974002450588674, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 1.8659877269190053, "kl": 0.11663818359375, "learning_rate": 2.0821440964047993e-07, "loss": 0.0736, "num_tokens": 2050792182.0, "reward": 2.3392858505249023, "reward_std": 0.4276411235332489, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1061.634033203125, "completions/mean_terminated_length": 785.451416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7976133397261733, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12145760039104567, "kl": 0.025238037109375, "learning_rate": 2.0799674572705317e-07, "loss": 0.0867, "num_tokens": 2051344898.0, "reward": 2.3208706378936768, "reward_std": 0.3850502371788025, "rewards/accuracy_reward/mean": 0.4399038553237915, "rewards/accuracy_reward/std": 0.4969730079174042, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12803789973258972, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1027.8125, "completions/mean_terminated_length": 767.7647094726562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7978264343934793, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1316396287913759, "kl": 0.027587890625, "learning_rate": 2.0777927108624966e-07, "loss": 0.073, "num_tokens": 2051871374.0, "reward": 2.3816964626312256, "reward_std": 0.4216080904006958, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11531686037778854, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1151.7545166015625, "completions/mean_terminated_length": 867.064697265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7980395290607852, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10326426520344567, "kl": 0.022308349609375, "learning_rate": 2.0756198583842478e-07, "loss": 0.0446, "num_tokens": 2052459264.0, "reward": 2.3136162757873535, "reward_std": 0.3580675423145294, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09605696052312851, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1020.2589721679688, "completions/mean_terminated_length": 816.9091186523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7982526237280912, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14498698957856446, "kl": 0.026092529296875, "learning_rate": 2.0734489010382938e-07, "loss": 0.0663, "num_tokens": 2052986660.0, "reward": 2.334263563156128, "reward_std": 0.4096223711967468, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16142748296260834, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1150.78125, "completions/mean_terminated_length": 833.637451171875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7984657183953972, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13706398143237442, "kl": 0.023193359375, "learning_rate": 2.0712798400260933e-07, "loss": 0.086, "num_tokens": 2053570770.0, "reward": 2.299107313156128, "reward_std": 0.4727000594139099, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 944.5201416015625, "completions/mean_terminated_length": 693.591796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.798678813062703, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1473395238650571, "kl": 0.029083251953125, "learning_rate": 2.069112676548053e-07, "loss": 0.1053, "num_tokens": 2054066987.0, "reward": 2.4559152126312256, "reward_std": 0.45777302980422974, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14869897067546844, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 862.0379638671875, "completions/mean_terminated_length": 732.873779296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7988919077300091, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15094615359444893, "kl": 0.032379150390625, "learning_rate": 2.0669474118035362e-07, "loss": 0.1065, "num_tokens": 2054519020.0, "reward": 2.5814733505249023, "reward_std": 0.39167726039886475, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11828288435935974, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 905.6183471679688, "completions/mean_terminated_length": 781.2005004882812, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.799105002397315, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12552220695252464, "kl": 0.028900146484375, "learning_rate": 2.064784046990849e-07, "loss": 0.0725, "num_tokens": 2054987409.0, "reward": 2.5200893878936768, "reward_std": 0.4024791121482849, "rewards/accuracy_reward/mean": 0.6087962985038757, "rewards/accuracy_reward/std": 0.4885856807231903, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11409792304039001, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 979.8147583007812, "completions/mean_terminated_length": 778.64453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.799318097064621, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13602199327134673, "kl": 0.027252197265625, "learning_rate": 2.0626225833072487e-07, "loss": 0.116, "num_tokens": 2055498014.0, "reward": 2.3482143878936768, "reward_std": 0.4339216947555542, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634626507759094, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 909.357177734375, "completions/mean_terminated_length": 779.0646362304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7995311917319269, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12998688527229676, "kl": 0.029022216796875, "learning_rate": 2.0604630219489379e-07, "loss": 0.0567, "num_tokens": 2055974526.0, "reward": 2.5083706378936768, "reward_std": 0.32721710205078125, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09830980002880096, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 902.3170166015625, "completions/mean_terminated_length": 735.2992553710938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7997442863992329, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13104773340760534, "kl": 0.02886962890625, "learning_rate": 2.0583053641110736e-07, "loss": 0.1113, "num_tokens": 2056448396.0, "reward": 2.4966518878936768, "reward_std": 0.4584699869155884, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1464626044034958, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 963.2366333007812, "completions/mean_terminated_length": 745.1206665039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7999573810665388, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.11731660996010555, "kl": 0.029205322265625, "learning_rate": 2.0561496109877492e-07, "loss": 0.0644, "num_tokens": 2056943782.0, "reward": 2.46484375, "reward_std": 0.279685914516449, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.19128035008907318, "rewards/tag_count_reward/mean": 0.9871651530265808, "rewards/tag_count_reward/std": 0.08991296589374542, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 859.9710083007812, "completions/mean_terminated_length": 693.7073974609375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8001704757338448, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14311185883933064, "kl": 0.032501220703125, "learning_rate": 2.0539957637720095e-07, "loss": 0.0686, "num_tokens": 2057395673.0, "reward": 2.532924175262451, "reward_std": 0.42804330587387085, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11614610999822617, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1106.4285888671875, "completions/mean_terminated_length": 788.8239135742188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8003835704011507, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12064399780381382, "kl": 0.0279541015625, "learning_rate": 2.0518438236558466e-07, "loss": 0.0731, "num_tokens": 2057966681.0, "reward": 2.3660714626312256, "reward_std": 0.44057387113571167, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15527118742465973, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 997.5647583007812, "completions/mean_terminated_length": 744.4127197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8005966650684566, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11527335610857593, "kl": 0.0279541015625, "learning_rate": 2.049693791830194e-07, "loss": 0.0543, "num_tokens": 2058477830.0, "reward": 2.4520089626312256, "reward_std": 0.4004580080509186, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13351379334926605, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1049.024658203125, "completions/mean_terminated_length": 769.3114013671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8008097597357626, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12610399470655284, "kl": 0.02801513671875, "learning_rate": 2.047545669484929e-07, "loss": 0.0915, "num_tokens": 2059018769.0, "reward": 2.4095983505249023, "reward_std": 0.42340442538261414, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9497767686843872, "rewards/tag_count_reward/std": 0.18543121218681335, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 981.9063110351562, "completions/mean_terminated_length": 735.8846435546875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8010228544030685, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1179028770441931, "kl": 0.02679443359375, "learning_rate": 2.045399457808873e-07, "loss": 0.0703, "num_tokens": 2059528023.0, "reward": 2.3705358505249023, "reward_std": 0.3360064923763275, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967317014932632, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1050.5513916015625, "completions/mean_terminated_length": 789.2478637695312, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8012359490703745, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1225193369370279, "kl": 0.027923583984375, "learning_rate": 2.0432551579897888e-07, "loss": 0.1017, "num_tokens": 2060066718.0, "reward": 2.4135046005249023, "reward_std": 0.4386962950229645, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.1758127510547638, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1077.790283203125, "completions/mean_terminated_length": 843.9722900390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8014490437376804, "frac_reward_zero_std": 0.0, "grad_norm": 0.128495007324455, "kl": 0.024017333984375, "learning_rate": 2.041112771214386e-07, "loss": 0.084, "num_tokens": 2060621456.0, "reward": 2.4268975257873535, "reward_std": 0.48979952931404114, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15371057391166687, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1129.2098388671875, "completions/mean_terminated_length": 837.3588256835938, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.8016621384049865, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11432182495592118, "kl": 0.023193359375, "learning_rate": 2.0389722986683062e-07, "loss": 0.074, "num_tokens": 2061196302.0, "reward": 2.3800225257873535, "reward_std": 0.3595236539840698, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10854540765285492, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 905.8214721679688, "completions/mean_terminated_length": 742.653076171875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8018752330722924, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13336266037123845, "kl": 0.030029296875, "learning_rate": 2.0368337415361413e-07, "loss": 0.0338, "num_tokens": 2061666654.0, "reward": 2.5245537757873535, "reward_std": 0.3657456934452057, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119430512189865, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 968.5938110351562, "completions/mean_terminated_length": 751.5549926757812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8020883277395983, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13204619205709126, "kl": 0.030609130859375, "learning_rate": 2.0346971010014178e-07, "loss": 0.0498, "num_tokens": 2062170936.0, "reward": 2.5, "reward_std": 0.45575597882270813, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13422717154026031, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1094.274658203125, "completions/mean_terminated_length": 780.1394653320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8023014224069043, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1341447193300278, "kl": 0.025146484375, "learning_rate": 2.0325623782466026e-07, "loss": 0.0405, "num_tokens": 2062735555.0, "reward": 2.33984375, "reward_std": 0.42057469487190247, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.151995450258255, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1005.622802734375, "completions/mean_terminated_length": 757.9862060546875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8025145170742102, "frac_reward_zero_std": 0.25, "grad_norm": 0.11478078446376976, "kl": 0.025482177734375, "learning_rate": 2.0304295744531013e-07, "loss": 0.0367, "num_tokens": 2063259530.0, "reward": 2.3900671005249023, "reward_std": 0.369486540555954, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1377149522304535, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 963.0647583007812, "completions/mean_terminated_length": 785.5298461914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8027276117415162, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14583690282262873, "kl": 0.030059814453125, "learning_rate": 2.0282986908012566e-07, "loss": 0.0948, "num_tokens": 2063761879.0, "reward": 2.4536831378936768, "reward_std": 0.3953183591365814, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11638233810663223, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 957.21435546875, "completions/mean_terminated_length": 716.4686279296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8029407064088221, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12664396139153772, "kl": 0.029876708984375, "learning_rate": 2.0261697284703512e-07, "loss": 0.0497, "num_tokens": 2064258215.0, "reward": 2.41796875, "reward_std": 0.41329115629196167, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14299829304218292, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 890.4375610351562, "completions/mean_terminated_length": 770.6896362304688, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8031538010761281, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14014581603126045, "kl": 0.03125, "learning_rate": 2.0240426886386025e-07, "loss": 0.1194, "num_tokens": 2064725467.0, "reward": 2.5161831378936768, "reward_std": 0.4407576322555542, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151519536972046, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1073.1942138671875, "completions/mean_terminated_length": 807.3380737304688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.803366895743434, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12499879519792503, "kl": 0.026611328125, "learning_rate": 2.0219175724831637e-07, "loss": 0.1464, "num_tokens": 2065278130.0, "reward": 2.4347100257873535, "reward_std": 0.4788230061531067, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13826683163642883, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 886.6027221679688, "completions/mean_terminated_length": 678.7737426757812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.80357999041074, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1354053015535307, "kl": 0.03265380859375, "learning_rate": 2.0197943811801237e-07, "loss": 0.0521, "num_tokens": 2065739056.0, "reward": 2.5301339626312256, "reward_std": 0.45878246426582336, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15822990238666534, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 966.8192138671875, "completions/mean_terminated_length": 756.3493041992188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8037930850780459, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12846976093896617, "kl": 0.0279541015625, "learning_rate": 2.0176731159045073e-07, "loss": 0.0171, "num_tokens": 2066242223.0, "reward": 2.4614956378936768, "reward_std": 0.4015711545944214, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11582320928573608, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 855.6339721679688, "completions/mean_terminated_length": 692.2131958007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8040061797453518, "frac_reward_zero_std": 0.0, "grad_norm": 0.15037841024694723, "kl": 0.034423828125, "learning_rate": 2.015553777830271e-07, "loss": 0.1134, "num_tokens": 2066687947.0, "reward": 2.548549175262451, "reward_std": 0.48142457008361816, "rewards/accuracy_reward/mean": 0.6921296119689941, "rewards/accuracy_reward/std": 0.4621478021144867, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15667887032032013, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 961.04248046875, "completions/mean_terminated_length": 773.2434692382812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8042192744126578, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14416007855740529, "kl": 0.027984619140625, "learning_rate": 2.0134363681303065e-07, "loss": 0.0369, "num_tokens": 2067183550.0, "reward": 2.4815850257873535, "reward_std": 0.4316096603870392, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14275363087654114, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 841.3170166015625, "completions/mean_terminated_length": 748.4952392578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8044323690799637, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13318427695070756, "kl": 0.03411865234375, "learning_rate": 2.0113208879764394e-07, "loss": 0.0404, "num_tokens": 2067620428.0, "reward": 2.654017925262451, "reward_std": 0.3979220688343048, "rewards/accuracy_reward/mean": 0.7321428656578064, "rewards/accuracy_reward/std": 0.4433377683162689, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11286582797765732, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 897.1563110351562, "completions/mean_terminated_length": 729.3861694335938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8046454637472698, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13125331870450319, "kl": 0.031707763671875, "learning_rate": 2.0092073385394266e-07, "loss": 0.0827, "num_tokens": 2068087010.0, "reward": 2.544642925262451, "reward_std": 0.43339887261390686, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14383502304553986, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1071.290283203125, "completions/mean_terminated_length": 829.1531982421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8048585584145757, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12188286150869962, "kl": 0.026397705078125, "learning_rate": 2.007095720988952e-07, "loss": 0.0879, "num_tokens": 2068637348.0, "reward": 2.4888393878936768, "reward_std": 0.40402692556381226, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09910815209150314, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1008.0156860351562, "completions/mean_terminated_length": 760.9475708007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8050716530818817, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1325395845940211, "kl": 0.029388427734375, "learning_rate": 2.004986036493638e-07, "loss": 0.0633, "num_tokens": 2069155739.0, "reward": 2.552455425262451, "reward_std": 0.4079567492008209, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969305515289307, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1302192211151123, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 984.6920166015625, "completions/mean_terminated_length": 774.3048095703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8052847477491876, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13944983256053214, "kl": 0.030731201171875, "learning_rate": 2.0028782862210312e-07, "loss": 0.0649, "num_tokens": 2069666657.0, "reward": 2.4073662757873535, "reward_std": 0.44240111112594604, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1253739446401596, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 899.700927734375, "completions/mean_terminated_length": 732.3017578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8054978424164936, "frac_reward_zero_std": 0.25, "grad_norm": 0.1402763108155338, "kl": 0.0330810546875, "learning_rate": 2.0007724713376135e-07, "loss": 0.0752, "num_tokens": 2070135387.0, "reward": 2.5652902126312256, "reward_std": 0.3437145948410034, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11682131141424179, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1046.575927734375, "completions/mean_terminated_length": 743.8197631835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8057109370837995, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12625194361251593, "kl": 0.031646728515625, "learning_rate": 1.9986685930087872e-07, "loss": 0.0547, "num_tokens": 2070676941.0, "reward": 2.325892925262451, "reward_std": 0.5002307295799255, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.18284475803375244, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1056.372802734375, "completions/mean_terminated_length": 817.393310546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8059240317511054, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10443118761932445, "kl": 0.028076171875, "learning_rate": 1.996566652398892e-07, "loss": 0.0456, "num_tokens": 2071212724.0, "reward": 2.4012277126312256, "reward_std": 0.38326969742774963, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692909002304, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 963.638427734375, "completions/mean_terminated_length": 824.3375244140625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8061371264184114, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13064643936662915, "kl": 0.02838134765625, "learning_rate": 1.99446665067119e-07, "loss": 0.0824, "num_tokens": 2071714354.0, "reward": 2.4447546005249023, "reward_std": 0.4725989103317261, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1514935940504074, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1044.171875, "completions/mean_terminated_length": 842.3297729492188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8063502210857173, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.17672283076157, "kl": 0.030242919921875, "learning_rate": 1.9923685889878707e-07, "loss": 0.0736, "num_tokens": 2072258447.0, "reward": 2.4302456378936768, "reward_std": 0.457394003868103, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 992.1942138671875, "completions/mean_terminated_length": 816.2265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8065633157530233, "frac_reward_zero_std": 0.0, "grad_norm": 0.14480298592758234, "kl": 0.028350830078125, "learning_rate": 1.9902724685100513e-07, "loss": 0.1274, "num_tokens": 2072772726.0, "reward": 2.392857313156128, "reward_std": 0.49510669708251953, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1145.078125, "completions/mean_terminated_length": 875.5101928710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8067764104203292, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10775098600232735, "kl": 0.022796630859375, "learning_rate": 1.988178290397773e-07, "loss": 0.057, "num_tokens": 2073352617.0, "reward": 2.3973214626312256, "reward_std": 0.40405386686325073, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11531686037778854, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1096.6629638671875, "completions/mean_terminated_length": 826.7994384765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8069895050876352, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12103206805729559, "kl": 0.026824951171875, "learning_rate": 1.9860860558100057e-07, "loss": 0.1071, "num_tokens": 2073912722.0, "reward": 2.328125, "reward_std": 0.3918082118034363, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13058780133724213, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 965.99560546875, "completions/mean_terminated_length": 741.4285278320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8072025997549411, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13517550430363887, "kl": 0.02764892578125, "learning_rate": 1.9839957659046386e-07, "loss": 0.0859, "num_tokens": 2074418560.0, "reward": 2.513392925262451, "reward_std": 0.40774139761924744, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10688954591751099, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1029.2054443359375, "completions/mean_terminated_length": 790.6446533203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.807415694422247, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12544393496048453, "kl": 0.027374267578125, "learning_rate": 1.9819074218384865e-07, "loss": 0.0619, "num_tokens": 2074940732.0, "reward": 2.4268975257873535, "reward_std": 0.4089321196079254, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16824373602867126, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1122.4375, "completions/mean_terminated_length": 810.2328491210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.807628789089553, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.134705934958554, "kl": 0.02264404296875, "learning_rate": 1.9798210247672907e-07, "loss": 0.0937, "num_tokens": 2075513728.0, "reward": 2.3699777126312256, "reward_std": 0.48972755670547485, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549686908722, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1475696861743927, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 961.1942138671875, "completions/mean_terminated_length": 756.5172119140625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.807841883756859, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1309955525681121, "kl": 0.03009033203125, "learning_rate": 1.977736575845711e-07, "loss": 0.0251, "num_tokens": 2076013079.0, "reward": 2.4486608505249023, "reward_std": 0.43551793694496155, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13840332627296448, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 956.6004638671875, "completions/mean_terminated_length": 733.6263427734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.808054978424165, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13587869737708755, "kl": 0.029571533203125, "learning_rate": 1.97565407622733e-07, "loss": 0.0704, "num_tokens": 2076513412.0, "reward": 2.435267925262451, "reward_std": 0.30990543961524963, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08213625103235245, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1082.7679443359375, "completions/mean_terminated_length": 826.4632568359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8082680730914709, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1440912242219125, "kl": 0.02459716796875, "learning_rate": 1.973573527064651e-07, "loss": 0.0708, "num_tokens": 2077073244.0, "reward": 2.341517925262451, "reward_std": 0.4613869786262512, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1826944798231125, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1040.337158203125, "completions/mean_terminated_length": 837.723876953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8084811677587769, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1355983875506404, "kl": 0.0264892578125, "learning_rate": 1.971494929509101e-07, "loss": 0.0832, "num_tokens": 2077613507.0, "reward": 2.4341518878936768, "reward_std": 0.4743957817554474, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1401556432247162, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1027.59375, "completions/mean_terminated_length": 798.9781494140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8086942624260828, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13451323027148585, "kl": 0.0263671875, "learning_rate": 1.9694182847110247e-07, "loss": 0.1089, "num_tokens": 2078144109.0, "reward": 2.439732313156128, "reward_std": 0.4938867688179016, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16231535375118256, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 885.0156860351562, "completions/mean_terminated_length": 722.2570190429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8089073570933888, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14898892363526484, "kl": 0.034088134765625, "learning_rate": 1.9673435938196826e-07, "loss": 0.0503, "num_tokens": 2078614548.0, "reward": 2.5552456378936768, "reward_std": 0.3411842882633209, "rewards/accuracy_reward/mean": 0.6620370149612427, "rewards/accuracy_reward/std": 0.4735642969608307, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1066.857177734375, "completions/mean_terminated_length": 833.7680053710938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8091204517606947, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12697935977043914, "kl": 0.027069091796875, "learning_rate": 1.9652708579832605e-07, "loss": 0.0796, "num_tokens": 2079159972.0, "reward": 2.35546875, "reward_std": 0.472360223531723, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1094.171875, "completions/mean_terminated_length": 874.0577392578125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8093335464280006, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1283896540380711, "kl": 0.024658203125, "learning_rate": 1.963200078348858e-07, "loss": 0.1107, "num_tokens": 2079719041.0, "reward": 2.4017858505249023, "reward_std": 0.4016064405441284, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 948.2120971679688, "completions/mean_terminated_length": 761.56396484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8095466410953066, "frac_reward_zero_std": 0.0, "grad_norm": 0.12906811895578327, "kl": 0.029541015625, "learning_rate": 1.961131256062494e-07, "loss": 0.0794, "num_tokens": 2080214432.0, "reward": 2.4760046005249023, "reward_std": 0.46407073736190796, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764793753623962, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1190.555908203125, "completions/mean_terminated_length": 931.3284912109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8097597357626125, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1190227112544496, "kl": 0.023834228515625, "learning_rate": 1.9590643922691013e-07, "loss": 0.0928, "num_tokens": 2080821673.0, "reward": 2.4029018878936768, "reward_std": 0.5205156207084656, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.17233900725841522, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1079.977783203125, "completions/mean_terminated_length": 794.60693359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8099728304299185, "frac_reward_zero_std": 0.0, "grad_norm": 0.12316714583750477, "kl": 0.025665283203125, "learning_rate": 1.9569994881125317e-07, "loss": 0.0857, "num_tokens": 2081381087.0, "reward": 2.3911831378936768, "reward_std": 0.5174334049224854, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764793753623962, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1106.078125, "completions/mean_terminated_length": 824.86669921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8101859250972244, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.09862052381771616, "kl": 0.023284912109375, "learning_rate": 1.9549365447355527e-07, "loss": 0.0315, "num_tokens": 2081945298.0, "reward": 2.392857313156128, "reward_std": 0.3353942334651947, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13106489181518555, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1008.7210083007812, "completions/mean_terminated_length": 758.257568359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8103990197645304, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13738314807664986, "kl": 0.028472900390625, "learning_rate": 1.9528755632798444e-07, "loss": 0.0859, "num_tokens": 2082464933.0, "reward": 2.4268975257873535, "reward_std": 0.48533666133880615, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.17139743268489838, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 995.9085083007812, "completions/mean_terminated_length": 745.964111328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8106121144318363, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12508686190014665, "kl": 0.026458740234375, "learning_rate": 1.9508165448860025e-07, "loss": 0.0544, "num_tokens": 2082980460.0, "reward": 2.4681921005249023, "reward_std": 0.38320353627204895, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1226801648736, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 958.0402221679688, "completions/mean_terminated_length": 776.3802490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8108252090991422, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1465439941462836, "kl": 0.032501220703125, "learning_rate": 1.9487594906935355e-07, "loss": 0.0513, "num_tokens": 2083486222.0, "reward": 2.4034600257873535, "reward_std": 0.4473932981491089, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1091.091552734375, "completions/mean_terminated_length": 801.7935791015625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8110383037664483, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.35775991035324856, "kl": 0.02960205078125, "learning_rate": 1.9467044018408685e-07, "loss": 0.0864, "num_tokens": 2084044039.0, "reward": 2.248326063156128, "reward_std": 0.46287810802459717, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9469866156578064, "rewards/tag_count_reward/std": 0.18043838441371918, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 979.4442138671875, "completions/mean_terminated_length": 781.5634765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8112513984337542, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13152068794394575, "kl": 0.02734375, "learning_rate": 1.9446512794653322e-07, "loss": 0.1027, "num_tokens": 2084550174.0, "reward": 2.44140625, "reward_std": 0.4648064970970154, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14801737666130066, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1107.6295166015625, "completions/mean_terminated_length": 861.2788696289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8114644931010602, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11873667335394705, "kl": 0.024810791015625, "learning_rate": 1.9426001247031725e-07, "loss": 0.0909, "num_tokens": 2085114008.0, "reward": 2.353236675262451, "reward_std": 0.4628064036369324, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1025.7076416015625, "completions/mean_terminated_length": 800.0789794921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8116775877683661, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1253862565931331, "kl": 0.026947021484375, "learning_rate": 1.9405509386895492e-07, "loss": 0.0434, "num_tokens": 2085648197.0, "reward": 2.458705425262451, "reward_std": 0.3712116777896881, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.10495071113109589, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1022.1339721679688, "completions/mean_terminated_length": 828.9336547851562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8118906824356721, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13963517370526465, "kl": 0.0286865234375, "learning_rate": 1.9385037225585292e-07, "loss": 0.1263, "num_tokens": 2086175553.0, "reward": 2.5318081378936768, "reward_std": 0.4854374825954437, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.17038200795650482, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1065.399658203125, "completions/mean_terminated_length": 821.8021850585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.812103777102978, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12967905736453128, "kl": 0.02593994140625, "learning_rate": 1.9364584774430893e-07, "loss": 0.0785, "num_tokens": 2086721668.0, "reward": 2.4765625, "reward_std": 0.403864324092865, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316954612732, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11589459329843521, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 840.1473388671875, "completions/mean_terminated_length": 638.8385620117188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.812316871770284, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1346870635567144, "kl": 0.031005859375, "learning_rate": 1.9344152044751162e-07, "loss": 0.0816, "num_tokens": 2087164102.0, "reward": 2.5630581378936768, "reward_std": 0.41586530208587646, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13381615281105042, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1018.7745971679688, "completions/mean_terminated_length": 811.8257446289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8125299664375899, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13519956505613137, "kl": 0.0267333984375, "learning_rate": 1.932373904785404e-07, "loss": 0.043, "num_tokens": 2087688817.0, "reward": 2.3387277126312256, "reward_std": 0.40027564764022827, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1293378323316574, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1010.2053833007812, "completions/mean_terminated_length": 781.1552734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8127430611048958, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12082378955727528, "kl": 0.026824951171875, "learning_rate": 1.9303345795036596e-07, "loss": 0.0559, "num_tokens": 2088212653.0, "reward": 2.3286831378936768, "reward_std": 0.3728073835372925, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14963631331920624, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1005.8772583007812, "completions/mean_terminated_length": 816.1504516601562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8129561557722018, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13111701838697298, "kl": 0.028076171875, "learning_rate": 1.9282972297584883e-07, "loss": 0.0779, "num_tokens": 2088735606.0, "reward": 2.502232313156128, "reward_std": 0.4124475419521332, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09438535571098328, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 989.1027221679688, "completions/mean_terminated_length": 815.8285522460938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8131692504395077, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1265401464176991, "kl": 0.02935791015625, "learning_rate": 1.926261856677411e-07, "loss": 0.0619, "num_tokens": 2089244068.0, "reward": 2.357701063156128, "reward_std": 0.37576520442962646, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11440251022577286, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1001.5803833007812, "completions/mean_terminated_length": 852.091796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8133823451068137, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12600557532524928, "kl": 0.028045654296875, "learning_rate": 1.9242284613868492e-07, "loss": 0.0532, "num_tokens": 2089768232.0, "reward": 2.4520089626312256, "reward_std": 0.4262053668498993, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12473504990339279, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 904.3817138671875, "completions/mean_terminated_length": 776.682373046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8135954397741196, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13995609731510578, "kl": 0.030792236328125, "learning_rate": 1.9221970450121324e-07, "loss": 0.095, "num_tokens": 2090248515.0, "reward": 2.52734375, "reward_std": 0.4384230375289917, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.15014436841011047, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 832.2745971679688, "completions/mean_terminated_length": 689.7830810546875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8138085344414256, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13702210624351652, "kl": 0.031768798828125, "learning_rate": 1.920167608677494e-07, "loss": 0.0594, "num_tokens": 2090690734.0, "reward": 2.625, "reward_std": 0.35233721137046814, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45739173889160156, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10517053306102753, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 882.68310546875, "completions/mean_terminated_length": 746.0997924804688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8140216291087315, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1420290558236536, "kl": 0.02996826171875, "learning_rate": 1.91814015350607e-07, "loss": 0.0699, "num_tokens": 2091156784.0, "reward": 2.4575893878936768, "reward_std": 0.46678900718688965, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14942027628421783, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 992.4553833007812, "completions/mean_terminated_length": 793.665771484375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8142347237760375, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12768523191439335, "kl": 0.028717041015625, "learning_rate": 1.916114680619904e-07, "loss": 0.0663, "num_tokens": 2091669836.0, "reward": 2.4581475257873535, "reward_std": 0.47829800844192505, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.49405214190483093, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1116.87060546875, "completions/mean_terminated_length": 866.2832641601562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8144478184433435, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.112577832189833, "kl": 0.0240478515625, "learning_rate": 1.9140911911399393e-07, "loss": 0.0324, "num_tokens": 2092244834.0, "reward": 2.416294813156128, "reward_std": 0.36861851811408997, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12405261397361755, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1042.1429443359375, "completions/mean_terminated_length": 823.478271484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8146609131106494, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12958577892049164, "kl": 0.027679443359375, "learning_rate": 1.9120696861860226e-07, "loss": 0.0466, "num_tokens": 2092777106.0, "reward": 2.334263563156128, "reward_std": 0.429955393075943, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 944.2120971679688, "completions/mean_terminated_length": 711.5216674804688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8148740077779554, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15337722624225922, "kl": 0.030548095703125, "learning_rate": 1.910050166876902e-07, "loss": 0.0624, "num_tokens": 2093262737.0, "reward": 2.455357313156128, "reward_std": 0.4916169047355652, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16768723726272583, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 961.2902221679688, "completions/mean_terminated_length": 821.6876220703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8150871024452613, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1432327153351817, "kl": 0.0289306640625, "learning_rate": 1.908032634330227e-07, "loss": 0.0737, "num_tokens": 2093765091.0, "reward": 2.5541296005249023, "reward_std": 0.41940391063690186, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.11946304142475128, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1093.904052734375, "completions/mean_terminated_length": 857.3732299804688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8153001971125673, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12096446403567682, "kl": 0.0247802734375, "learning_rate": 1.9060170896625466e-07, "loss": 0.0702, "num_tokens": 2094329544.0, "reward": 2.3275671005249023, "reward_std": 0.4879489541053772, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.1706821471452713, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1091.4420166015625, "completions/mean_terminated_length": 844.2415771484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8155132917798732, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11362024005693917, "kl": 0.024017333984375, "learning_rate": 1.9040035339893102e-07, "loss": 0.0789, "num_tokens": 2094887486.0, "reward": 2.3761162757873535, "reward_std": 0.3883502185344696, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11987641453742981, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1126.7388916015625, "completions/mean_terminated_length": 904.7174072265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8157263864471792, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12341640906879822, "kl": 0.02490234375, "learning_rate": 1.9019919684248689e-07, "loss": 0.0844, "num_tokens": 2095457529.0, "reward": 2.3465402126312256, "reward_std": 0.45748406648635864, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822286784648895, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15223345160484314, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 980.8058471679688, "completions/mean_terminated_length": 809.3911743164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8159394811144851, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13535965508258643, "kl": 0.028900146484375, "learning_rate": 1.8999823940824688e-07, "loss": 0.134, "num_tokens": 2095962498.0, "reward": 2.447544813156128, "reward_std": 0.47021356225013733, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14886364340782166, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 881.7277221679688, "completions/mean_terminated_length": 731.9042358398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.816152575781791, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1361864352144198, "kl": 0.034271240234375, "learning_rate": 1.8979748120742562e-07, "loss": 0.0689, "num_tokens": 2096423960.0, "reward": 2.5597100257873535, "reward_std": 0.459262877702713, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.4769369065761566, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 952.4464721679688, "completions/mean_terminated_length": 783.0308837890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.816365670449097, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11655359659526496, "kl": 0.028778076171875, "learning_rate": 1.8959692235112735e-07, "loss": 0.083, "num_tokens": 2096920000.0, "reward": 2.4441964626312256, "reward_std": 0.4263933300971985, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1056.8170166015625, "completions/mean_terminated_length": 844.6124877929688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8165787651164029, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1150665711979237, "kl": 0.025726318359375, "learning_rate": 1.8939656295034588e-07, "loss": 0.0365, "num_tokens": 2097469118.0, "reward": 2.357142925262451, "reward_std": 0.4608336091041565, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14238695800304413, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 897.10498046875, "completions/mean_terminated_length": 725.9461669921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8167918597837089, "frac_reward_zero_std": 0.25, "grad_norm": 0.1281683491048, "kl": 0.030914306640625, "learning_rate": 1.891964031159653e-07, "loss": 0.0299, "num_tokens": 2097941821.0, "reward": 2.5106027126312256, "reward_std": 0.3279326558113098, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.09791535139083862, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 986.4576416015625, "completions/mean_terminated_length": 762.6729736328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8170049544510148, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3264248516741544, "kl": 0.0328369140625, "learning_rate": 1.8899644295875815e-07, "loss": 0.0799, "num_tokens": 2098454202.0, "reward": 2.493861675262451, "reward_std": 0.48603954911231995, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12892211973667145, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 902.325927734375, "completions/mean_terminated_length": 748.6025390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8172180491183209, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14020973383759344, "kl": 0.03302001953125, "learning_rate": 1.887966825893875e-07, "loss": 0.0596, "num_tokens": 2098921756.0, "reward": 2.615513563156128, "reward_std": 0.40775319933891296, "rewards/accuracy_reward/mean": 0.7098214030265808, "rewards/accuracy_reward/std": 0.4543519914150238, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1113.2545166015625, "completions/mean_terminated_length": 837.693603515625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8174311437856268, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.106747157000383, "kl": 0.023712158203125, "learning_rate": 1.8859712211840522e-07, "loss": 0.0357, "num_tokens": 2099490926.0, "reward": 2.369419813156128, "reward_std": 0.37991073727607727, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1479213982820511, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1094.247802734375, "completions/mean_terminated_length": 890.0569458007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8176442384529328, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2685312444455439, "kl": 0.02783203125, "learning_rate": 1.8839776165625288e-07, "loss": 0.0968, "num_tokens": 2100053341.0, "reward": 2.1746652126312256, "reward_std": 0.48554307222366333, "rewards/accuracy_reward/mean": 0.35648149251937866, "rewards/accuracy_reward/std": 0.47951504588127136, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9380580186843872, "rewards/tag_count_reward/std": 0.2018849104642868, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1059.265625, "completions/mean_terminated_length": 807.2352905273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8178573331202387, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12807251681719428, "kl": 0.027923583984375, "learning_rate": 1.8819860131326114e-07, "loss": 0.1079, "num_tokens": 2100598772.0, "reward": 2.3197546005249023, "reward_std": 0.507283091545105, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14529338479042053, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 916.5960083007812, "completions/mean_terminated_length": 748.3359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8180704277875446, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.130171173202459, "kl": 0.028717041015625, "learning_rate": 1.8799964119964994e-07, "loss": 0.0373, "num_tokens": 2101076655.0, "reward": 2.53125, "reward_std": 0.3690985441207886, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11566276848316193, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 861.3214721679688, "completions/mean_terminated_length": 698.68017578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8182835224548506, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13289740377845502, "kl": 0.02978515625, "learning_rate": 1.878008814255287e-07, "loss": 0.0695, "num_tokens": 2101534335.0, "reward": 2.5078125, "reward_std": 0.3831934630870819, "rewards/accuracy_reward/mean": 0.6226851940155029, "rewards/accuracy_reward/std": 0.4852766990661621, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15391045808792114, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 985.1585083007812, "completions/mean_terminated_length": 761.1000366210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8184966171221565, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1325864013246796, "kl": 0.02691650390625, "learning_rate": 1.8760232210089549e-07, "loss": 0.0748, "num_tokens": 2102050614.0, "reward": 2.3777902126312256, "reward_std": 0.481874018907547, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15941192209720612, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 981.9754638671875, "completions/mean_terminated_length": 787.8971557617188, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.8187097117894625, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13756733492060325, "kl": 0.027069091796875, "learning_rate": 1.8740396333563797e-07, "loss": 0.0651, "num_tokens": 2102563339.0, "reward": 2.4486608505249023, "reward_std": 0.3990635275840759, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10688954591751099, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1038.0179443359375, "completions/mean_terminated_length": 828.39892578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8189228064567684, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13069393468803966, "kl": 0.02618408203125, "learning_rate": 1.872058052395323e-07, "loss": 0.0775, "num_tokens": 2103093475.0, "reward": 2.439732313156128, "reward_std": 0.43193283677101135, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 974.6808471679688, "completions/mean_terminated_length": 799.0467529296875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8191359011240744, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12573697747410711, "kl": 0.02899169921875, "learning_rate": 1.8700784792224394e-07, "loss": 0.0526, "num_tokens": 2103596308.0, "reward": 2.3588171005249023, "reward_std": 0.46852564811706543, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 979.5067138671875, "completions/mean_terminated_length": 778.2785034179688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8193489957913803, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1338155037707908, "kl": 0.0306396484375, "learning_rate": 1.8681009149332708e-07, "loss": 0.1056, "num_tokens": 2104101607.0, "reward": 2.517857313156128, "reward_std": 0.4564667046070099, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12733516097068787, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1017.8326416015625, "completions/mean_terminated_length": 776.6088256835938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8195620904586862, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12011659044037523, "kl": 0.02557373046875, "learning_rate": 1.8661253606222462e-07, "loss": 0.1024, "num_tokens": 2104638236.0, "reward": 2.372767925262451, "reward_std": 0.4167993664741516, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16370917856693268, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1125.3460693359375, "completions/mean_terminated_length": 924.76904296875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8197751851259922, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11385795383098586, "kl": 0.0235595703125, "learning_rate": 1.8641518173826858e-07, "loss": 0.0296, "num_tokens": 2105209591.0, "reward": 2.364955425262451, "reward_std": 0.44534406065940857, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13911856710910797, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1088.52685546875, "completions/mean_terminated_length": 826.852294921875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8199882797932981, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.124590886487887, "kl": 0.0242919921875, "learning_rate": 1.8621802863067959e-07, "loss": 0.0939, "num_tokens": 2105775107.0, "reward": 2.3325893878936768, "reward_std": 0.4274972379207611, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15048591792583466, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 937.8035888671875, "completions/mean_terminated_length": 762.8114013671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8202013744606041, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15348456023486645, "kl": 0.030029296875, "learning_rate": 1.8602107684856637e-07, "loss": 0.1029, "num_tokens": 2106256779.0, "reward": 2.5267858505249023, "reward_std": 0.49287956953048706, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15220165252685547, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 975.6250610351562, "completions/mean_terminated_length": 812.9768676757812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.82041446912791, "frac_reward_zero_std": 0.25, "grad_norm": 0.11885162763176045, "kl": 0.028778076171875, "learning_rate": 1.8582432650092705e-07, "loss": 0.0552, "num_tokens": 2106769155.0, "reward": 2.40625, "reward_std": 0.3230324387550354, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005797147750854, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119429767131805, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 973.0156860351562, "completions/mean_terminated_length": 777.3060913085938, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8206275637952161, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12714781788741422, "kl": 0.029937744140625, "learning_rate": 1.8562777769664758e-07, "loss": 0.0888, "num_tokens": 2107279162.0, "reward": 2.4363839626312256, "reward_std": 0.4241209328174591, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12247265130281448, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 898.2098388671875, "completions/mean_terminated_length": 750.5037231445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.820840658462522, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14487028545693348, "kl": 0.03302001953125, "learning_rate": 1.8543143054450305e-07, "loss": 0.0826, "num_tokens": 2107748344.0, "reward": 2.4793527126312256, "reward_std": 0.43265241384506226, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13566918671131134, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 920.79248046875, "completions/mean_terminated_length": 766.302001953125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.821053753129828, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.2758949112052222, "kl": 0.029266357421875, "learning_rate": 1.8523528515315635e-07, "loss": 0.0425, "num_tokens": 2108227035.0, "reward": 2.533482313156128, "reward_std": 0.41881516575813293, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11923423409461975, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1000.5803833007812, "completions/mean_terminated_length": 806.6137084960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8212668477971339, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14576830671774685, "kl": 0.027130126953125, "learning_rate": 1.8503934163115875e-07, "loss": 0.0337, "num_tokens": 2108737039.0, "reward": 2.4296875, "reward_std": 0.47552403807640076, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15116052329540253, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 987.6920166015625, "completions/mean_terminated_length": 753.6730346679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8214799424644398, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12487412155197368, "kl": 0.02813720703125, "learning_rate": 1.8484360008695036e-07, "loss": 0.0381, "num_tokens": 2109249397.0, "reward": 2.45703125, "reward_std": 0.39589551091194153, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1091761440038681, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1027.29248046875, "completions/mean_terminated_length": 808.7669677734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8216930371317458, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1357954579969625, "kl": 0.02862548828125, "learning_rate": 1.8464806062885897e-07, "loss": 0.0881, "num_tokens": 2109779480.0, "reward": 2.5223214626312256, "reward_std": 0.3850313127040863, "rewards/accuracy_reward/mean": 0.6319444179534912, "rewards/accuracy_reward/std": 0.48283568024635315, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12449962645769119, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1087.3125, "completions/mean_terminated_length": 821.8233642578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8219061317990517, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1325053492479136, "kl": 0.02606201171875, "learning_rate": 1.844527233651007e-07, "loss": 0.1443, "num_tokens": 2110331076.0, "reward": 2.4252233505249023, "reward_std": 0.43619370460510254, "rewards/accuracy_reward/mean": 0.5532407164573669, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1717585027217865, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1008.4910888671875, "completions/mean_terminated_length": 835.2396240234375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8221192264663577, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13293971863473403, "kl": 0.027557373046875, "learning_rate": 1.842575884037797e-07, "loss": 0.0716, "num_tokens": 2110853616.0, "reward": 2.4849331378936768, "reward_std": 0.385657399892807, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14874933660030365, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1057.607177734375, "completions/mean_terminated_length": 812.0780029296875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8223323211336636, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13276222701829757, "kl": 0.029052734375, "learning_rate": 1.8406265585288856e-07, "loss": 0.0784, "num_tokens": 2111397232.0, "reward": 2.4369421005249023, "reward_std": 0.48796749114990234, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1579248607158661, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1024.22998046875, "completions/mean_terminated_length": 748.7110595703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8225454158009696, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13464395208980903, "kl": 0.0252685546875, "learning_rate": 1.8386792582030713e-07, "loss": 0.098, "num_tokens": 2111925767.0, "reward": 2.4503350257873535, "reward_std": 0.48645058274269104, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1114.263427734375, "completions/mean_terminated_length": 817.6647338867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8227585104682755, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1118619029596137, "kl": 0.024658203125, "learning_rate": 1.8367339841380365e-07, "loss": 0.0556, "num_tokens": 2112498557.0, "reward": 2.4140625, "reward_std": 0.36657649278640747, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266101151704788, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1176.888427734375, "completions/mean_terminated_length": 893.3905639648438, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.8229716051355814, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12059987794815479, "kl": 0.022308349609375, "learning_rate": 1.834790737410343e-07, "loss": 0.0481, "num_tokens": 2113104523.0, "reward": 2.2628350257873535, "reward_std": 0.4339313805103302, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15933358669281006, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1030.26123046875, "completions/mean_terminated_length": 812.3712768554688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8231846998028874, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1394780496726591, "kl": 0.02801513671875, "learning_rate": 1.8328495190954294e-07, "loss": 0.0973, "num_tokens": 2113633728.0, "reward": 2.3236608505249023, "reward_std": 0.49038365483283997, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.1637244075536728, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 930.4375610351562, "completions/mean_terminated_length": 780.486083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8233977944701933, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13991560733372568, "kl": 0.02880859375, "learning_rate": 1.83091033026761e-07, "loss": 0.1137, "num_tokens": 2114121252.0, "reward": 2.5306921005249023, "reward_std": 0.41868260502815247, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11418405920267105, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 952.4688110351562, "completions/mean_terminated_length": 753.0184936523438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8236108891374994, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1350422985019993, "kl": 0.0279541015625, "learning_rate": 1.8289731720000784e-07, "loss": 0.1227, "num_tokens": 2114621286.0, "reward": 2.4614956378936768, "reward_std": 0.4467255175113678, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14945264160633087, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1081.0179443359375, "completions/mean_terminated_length": 827.6957397460938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8238239838048053, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12892943630633458, "kl": 0.025634765625, "learning_rate": 1.8270380453649012e-07, "loss": 0.0497, "num_tokens": 2115170894.0, "reward": 2.404017925262451, "reward_std": 0.4098993241786957, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1300705373287201, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 933.4107666015625, "completions/mean_terminated_length": 774.1836547851562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8240370784721113, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12439047973757066, "kl": 0.0284423828125, "learning_rate": 1.8251049514330268e-07, "loss": 0.0543, "num_tokens": 2115653142.0, "reward": 2.584263563156128, "reward_std": 0.4098218083381653, "rewards/accuracy_reward/mean": 0.6696428656578064, "rewards/accuracy_reward/std": 0.47086748480796814, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11734377592802048, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 927.0000610351562, "completions/mean_terminated_length": 789.3333129882812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8242501731394172, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13672328599911926, "kl": 0.029754638671875, "learning_rate": 1.823173891274271e-07, "loss": 0.1029, "num_tokens": 2116144390.0, "reward": 2.505580425262451, "reward_std": 0.44610515236854553, "rewards/accuracy_reward/mean": 0.6412037014961243, "rewards/accuracy_reward/std": 0.48020341992378235, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1038.5625, "completions/mean_terminated_length": 838.834228515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8244632678067232, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12181813640928163, "kl": 0.02777099609375, "learning_rate": 1.8212448659573298e-07, "loss": 0.0398, "num_tokens": 2116685970.0, "reward": 2.51171875, "reward_std": 0.4328148365020752, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517839670181274, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1027.40185546875, "completions/mean_terminated_length": 802.1471557617188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8246763624740291, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1201401857596392, "kl": 0.02813720703125, "learning_rate": 1.8193178765497708e-07, "loss": -0.0019, "num_tokens": 2117219638.0, "reward": 2.4520089626312256, "reward_std": 0.3480514585971832, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11305922269821167, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1140.5, "completions/mean_terminated_length": 827.09912109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.824889457141335, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10831456889204942, "kl": 0.0234375, "learning_rate": 1.8173929241180347e-07, "loss": 0.0635, "num_tokens": 2117802742.0, "reward": 2.4213171005249023, "reward_std": 0.3452896475791931, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12540756165981293, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1064.90185546875, "completions/mean_terminated_length": 807.3577270507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.825102551808641, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12876231627715187, "kl": 0.027099609375, "learning_rate": 1.8154700097274365e-07, "loss": 0.1125, "num_tokens": 2118348618.0, "reward": 2.421875, "reward_std": 0.40664952993392944, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11244472116231918, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 993.5313110351562, "completions/mean_terminated_length": 839.8107299804688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.8253156464759469, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13353373629243176, "kl": 0.029144287109375, "learning_rate": 1.8135491344421592e-07, "loss": 0.0492, "num_tokens": 2118867896.0, "reward": 2.4681921005249023, "reward_std": 0.45001834630966187, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15306761860847473, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 928.6094360351562, "completions/mean_terminated_length": 745.4363403320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8255287411432529, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14160324242092542, "kl": 0.03265380859375, "learning_rate": 1.8116302993252636e-07, "loss": 0.115, "num_tokens": 2119353577.0, "reward": 2.424107313156128, "reward_std": 0.4820365309715271, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16000695526599884, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 972.888427734375, "completions/mean_terminated_length": 742.7154541015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8257418358105588, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12670570313110796, "kl": 0.026763916015625, "learning_rate": 1.8097135054386766e-07, "loss": 0.106, "num_tokens": 2119859287.0, "reward": 2.3426339626312256, "reward_std": 0.39663228392601013, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1591111123561859, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 924.7388916015625, "completions/mean_terminated_length": 767.5394287109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8259549304778648, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13285470215719886, "kl": 0.0301513671875, "learning_rate": 1.807798753843197e-07, "loss": 0.0997, "num_tokens": 2120344882.0, "reward": 2.529576063156128, "reward_std": 0.43511202931404114, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1053.046875, "completions/mean_terminated_length": 878.0813598632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8261680251451707, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12155558092311321, "kl": 0.02716064453125, "learning_rate": 1.8058860455984936e-07, "loss": 0.0712, "num_tokens": 2120891719.0, "reward": 2.4386162757873535, "reward_std": 0.42629992961883545, "rewards/accuracy_reward/mean": 0.5763888955116272, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.151918426156044, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 930.30810546875, "completions/mean_terminated_length": 773.8880615234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8263811198124766, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11684979459665312, "kl": 0.030426025390625, "learning_rate": 1.803975381763103e-07, "loss": 0.0823, "num_tokens": 2121378225.0, "reward": 2.521205425262451, "reward_std": 0.396239697933197, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10841450095176697, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 994.0558471679688, "completions/mean_terminated_length": 818.3984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8265942144797827, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12165412316368807, "kl": 0.027313232421875, "learning_rate": 1.8020667633944325e-07, "loss": 0.0322, "num_tokens": 2121897642.0, "reward": 2.368861675262451, "reward_std": 0.41608792543411255, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561823636293411, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 904.6250610351562, "completions/mean_terminated_length": 747.9187622070312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8268073091470886, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13388085119862358, "kl": 0.029296875, "learning_rate": 1.8001601915487545e-07, "loss": 0.0505, "num_tokens": 2122371074.0, "reward": 2.4888393878936768, "reward_std": 0.3655345141887665, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12782442569732666, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1013.9576416015625, "completions/mean_terminated_length": 782.286865234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8270204038143946, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12578737094431588, "kl": 0.02886962890625, "learning_rate": 1.798255667281213e-07, "loss": 0.0449, "num_tokens": 2122905439.0, "reward": 2.4609375, "reward_std": 0.40351077914237976, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1445406824350357, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 910.232177734375, "completions/mean_terminated_length": 751.0025634765625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8272334984817005, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13519484017282188, "kl": 0.02880859375, "learning_rate": 1.7963531916458152e-07, "loss": 0.0627, "num_tokens": 2123381959.0, "reward": 2.6138393878936768, "reward_std": 0.42811423540115356, "rewards/accuracy_reward/mean": 0.6964285969734192, "rewards/accuracy_reward/std": 0.4603137969970703, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1010.9085083007812, "completions/mean_terminated_length": 853.61181640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8274465931490065, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1404572799736099, "kl": 0.02880859375, "learning_rate": 1.794452765695436e-07, "loss": 0.1375, "num_tokens": 2123909374.0, "reward": 2.412388563156128, "reward_std": 0.47555533051490784, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1293378323316574, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 982.68310546875, "completions/mean_terminated_length": 811.5699462890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8276596878163124, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14423628699285146, "kl": 0.03118896484375, "learning_rate": 1.7925543904818151e-07, "loss": 0.1118, "num_tokens": 2124422320.0, "reward": 2.5106027126312256, "reward_std": 0.48660579323768616, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16055898368358612, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1037.466552734375, "completions/mean_terminated_length": 807.6740112304688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8278727824836184, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13426936768068357, "kl": 0.0296630859375, "learning_rate": 1.790658067055558e-07, "loss": 0.1053, "num_tokens": 2124952145.0, "reward": 2.4151787757873535, "reward_std": 0.5645465850830078, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.18115736544132233, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1094.7098388671875, "completions/mean_terminated_length": 791.9000244140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8280858771509243, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13075497930710098, "kl": 0.0252685546875, "learning_rate": 1.7887637964661367e-07, "loss": 0.0652, "num_tokens": 2125510991.0, "reward": 2.3348214626312256, "reward_std": 0.44865262508392334, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16516682505607605, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 885.7388916015625, "completions/mean_terminated_length": 702.5401000976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8282989718182302, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14894528813990038, "kl": 0.029815673828125, "learning_rate": 1.786871579761881e-07, "loss": 0.0564, "num_tokens": 2125973210.0, "reward": 2.4464287757873535, "reward_std": 0.4171983301639557, "rewards/accuracy_reward/mean": 0.5671296119689941, "rewards/accuracy_reward/std": 0.4960475564002991, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 886.3281860351562, "completions/mean_terminated_length": 733.7853393554688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8285120664855362, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14488186379711643, "kl": 0.030853271484375, "learning_rate": 1.784981417989991e-07, "loss": 0.0297, "num_tokens": 2126438637.0, "reward": 2.4129464626312256, "reward_std": 0.3363885283470154, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10110349208116531, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1064.8973388671875, "completions/mean_terminated_length": 854.4227905273438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8287251611528421, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11943081372714248, "kl": 0.0257568359375, "learning_rate": 1.7830933121965258e-07, "loss": 0.0862, "num_tokens": 2126990415.0, "reward": 2.46484375, "reward_std": 0.44837912917137146, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1463720053434372, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1013.41748046875, "completions/mean_terminated_length": 788.5081787109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8289382558201481, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11180027517430349, "kl": 0.0274658203125, "learning_rate": 1.7812072634264078e-07, "loss": 0.0209, "num_tokens": 2127520362.0, "reward": 2.537388563156128, "reward_std": 0.3675636649131775, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.10248777270317078, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1070.0513916015625, "completions/mean_terminated_length": 770.6793212890625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.829151350487454, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1328632875438324, "kl": 0.02593994140625, "learning_rate": 1.7793232727234193e-07, "loss": 0.083, "num_tokens": 2128071857.0, "reward": 2.41015625, "reward_std": 0.41582852602005005, "rewards/accuracy_reward/mean": 0.5347222089767456, "rewards/accuracy_reward/std": 0.499371200799942, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14936910569667816, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1016.58935546875, "completions/mean_terminated_length": 757.2960815429688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.82936444515476, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11711086712330981, "kl": 0.02813720703125, "learning_rate": 1.7774413411302058e-07, "loss": 0.0281, "num_tokens": 2128594937.0, "reward": 2.2455358505249023, "reward_std": 0.36991551518440247, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12449962645769119, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 953.888427734375, "completions/mean_terminated_length": 758.1000366210938, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.829577539822066, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14365553250977833, "kl": 0.030548095703125, "learning_rate": 1.7755614696882726e-07, "loss": 0.0558, "num_tokens": 2129085751.0, "reward": 2.459263563156128, "reward_std": 0.45786023139953613, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416090607643127, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 924.0402221679688, "completions/mean_terminated_length": 743.5077514648438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.829790634489372, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1329382005511629, "kl": 0.03009033203125, "learning_rate": 1.7736836594379847e-07, "loss": 0.0566, "num_tokens": 2129570265.0, "reward": 2.560267925262451, "reward_std": 0.36560624837875366, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09178353101015091, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1041.685302734375, "completions/mean_terminated_length": 816.2267456054688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8300037291566779, "frac_reward_zero_std": 0.0, "grad_norm": 0.13223365330253428, "kl": 0.02740478515625, "learning_rate": 1.7718079114185662e-07, "loss": 0.0836, "num_tokens": 2130105852.0, "reward": 2.4140625, "reward_std": 0.48807382583618164, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1659623235464096, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 963.3527221679688, "completions/mean_terminated_length": 701.9556884765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8302168238239838, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11031613100030942, "kl": 0.027923583984375, "learning_rate": 1.7699342266681e-07, "loss": 0.046, "num_tokens": 2130603482.0, "reward": 2.4732143878936768, "reward_std": 0.359651654958725, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407235741615295, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12383606284856796, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1012.5535888671875, "completions/mean_terminated_length": 824.042236328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8304299184912898, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12201883068951357, "kl": 0.02777099609375, "learning_rate": 1.7680626062235266e-07, "loss": 0.061, "num_tokens": 2131128850.0, "reward": 2.4386162757873535, "reward_std": 0.40508919954299927, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11345604062080383, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1056.07373046875, "completions/mean_terminated_length": 846.9649047851562, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8306430131585957, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10846980145606772, "kl": 0.024993896484375, "learning_rate": 1.7661930511206463e-07, "loss": 0.0484, "num_tokens": 2131669491.0, "reward": 2.502232313156128, "reward_std": 0.34739798307418823, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10247711092233658, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1018.8906860351562, "completions/mean_terminated_length": 774.4061279296875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8308561078259017, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12667823279699056, "kl": 0.02667236328125, "learning_rate": 1.7643255623941117e-07, "loss": 0.0472, "num_tokens": 2132197682.0, "reward": 2.3253350257873535, "reward_std": 0.40768349170684814, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1475696861743927, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1061.3773193359375, "completions/mean_terminated_length": 826.9862060546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8310692024932076, "frac_reward_zero_std": 0.0, "grad_norm": 0.14445302679487643, "kl": 0.024932861328125, "learning_rate": 1.762460141077438e-07, "loss": 0.0949, "num_tokens": 2132741211.0, "reward": 2.251674175262451, "reward_std": 0.41256797313690186, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14025694131851196, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1119.9754638671875, "completions/mean_terminated_length": 866.8778686523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8312822971605136, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12563825568648504, "kl": 0.0252685546875, "learning_rate": 1.7605967882029932e-07, "loss": 0.0819, "num_tokens": 2133317744.0, "reward": 2.2862725257873535, "reward_std": 0.5163499712944031, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13598167896270752, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1097.65625, "completions/mean_terminated_length": 828.0745239257812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8314953918278195, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1269563376185568, "kl": 0.026123046875, "learning_rate": 1.758735504801997e-07, "loss": 0.0667, "num_tokens": 2133883350.0, "reward": 2.3973214626312256, "reward_std": 0.42701372504234314, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1561690717935562, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1003.685302734375, "completions/mean_terminated_length": 755.5884399414062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8317084864951254, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11602401799526892, "kl": 0.02911376953125, "learning_rate": 1.7568762919045306e-07, "loss": 0.0345, "num_tokens": 2134405209.0, "reward": 2.423549175262451, "reward_std": 0.34538063406944275, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1042.6942138671875, "completions/mean_terminated_length": 782.8960571289062, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8319215811624314, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12879749600557552, "kl": 0.026885986328125, "learning_rate": 1.755019150539524e-07, "loss": 0.0564, "num_tokens": 2134939952.0, "reward": 2.388951063156128, "reward_std": 0.3237634301185608, "rewards/accuracy_reward/mean": 0.46296295523643494, "rewards/accuracy_reward/std": 0.49920445680618286, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1098259910941124, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 944.3370971679688, "completions/mean_terminated_length": 776.9434204101562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8321346758297373, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1388903616038368, "kl": 0.030670166015625, "learning_rate": 1.7531640817347659e-07, "loss": 0.1158, "num_tokens": 2135432247.0, "reward": 2.5262277126312256, "reward_std": 0.4996793866157532, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14611591398715973, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1064.638427734375, "completions/mean_terminated_length": 844.3223876953125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8323477704970433, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1257078028503783, "kl": 0.025299072265625, "learning_rate": 1.7513110865168902e-07, "loss": 0.0656, "num_tokens": 2135979605.0, "reward": 2.4140625, "reward_std": 0.4264105260372162, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12695714831352234, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 925.1942138671875, "completions/mean_terminated_length": 703.0347900390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8325608651643492, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13144413664802096, "kl": 0.029876708984375, "learning_rate": 1.7494601659113915e-07, "loss": 0.0315, "num_tokens": 2136461692.0, "reward": 2.4776787757873535, "reward_std": 0.3364052176475525, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11849905550479889, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 858.0535888671875, "completions/mean_terminated_length": 670.490966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8327739598316553, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13635299207740328, "kl": 0.035430908203125, "learning_rate": 1.7476113209426118e-07, "loss": 0.047, "num_tokens": 2136913172.0, "reward": 2.451451063156128, "reward_std": 0.4038492739200592, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1273927539587021, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1030.8795166015625, "completions/mean_terminated_length": 775.1787719726562, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8329870544989612, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11100953713700229, "kl": 0.027191162109375, "learning_rate": 1.745764552633745e-07, "loss": 0.0602, "num_tokens": 2137446350.0, "reward": 2.44921875, "reward_std": 0.34960824251174927, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09953393787145615, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 872.1563110351562, "completions/mean_terminated_length": 714.3848266601562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8332001491662672, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11625744194766788, "kl": 0.0311279296875, "learning_rate": 1.7439198620068362e-07, "loss": 0.0208, "num_tokens": 2137908932.0, "reward": 2.564732313156128, "reward_std": 0.4211812913417816, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10649171471595764, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1030.837158203125, "completions/mean_terminated_length": 823.029541015625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8334132438335731, "frac_reward_zero_std": 0.0, "grad_norm": 0.13610590295865335, "kl": 0.02581787109375, "learning_rate": 1.7420772500827795e-07, "loss": 0.08, "num_tokens": 2138452475.0, "reward": 2.4207589626312256, "reward_std": 0.4782007038593292, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1555563360452652, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1043.8035888671875, "completions/mean_terminated_length": 801.7949829101562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.833626338500879, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11679535852442592, "kl": 0.0257568359375, "learning_rate": 1.740236717881322e-07, "loss": 0.0624, "num_tokens": 2138990803.0, "reward": 2.33984375, "reward_std": 0.36936238408088684, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134778022766, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10500273108482361, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 840.1317138671875, "completions/mean_terminated_length": 678.0632934570312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.833839433168185, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14948153743774778, "kl": 0.033111572265625, "learning_rate": 1.7383982664210556e-07, "loss": 0.1414, "num_tokens": 2139434814.0, "reward": 2.536830425262451, "reward_std": 0.4081796109676361, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.4822678565979004, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.1194591298699379, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1040.1898193359375, "completions/mean_terminated_length": 811.0164794921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.8340525278354909, "frac_reward_zero_std": 0.0, "grad_norm": 0.1282646551445139, "kl": 0.027313232421875, "learning_rate": 1.7365618967194208e-07, "loss": 0.0811, "num_tokens": 2139973907.0, "reward": 2.421875, "reward_std": 0.460305392742157, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13376134634017944, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 961.0558471679688, "completions/mean_terminated_length": 786.4688720703125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8342656225027969, "frac_reward_zero_std": 0.0, "grad_norm": 0.15055271343526974, "kl": 0.028961181640625, "learning_rate": 1.7347276097927105e-07, "loss": 0.0922, "num_tokens": 2140476140.0, "reward": 2.376674175262451, "reward_std": 0.47188296914100647, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14989471435546875, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1180.337158203125, "completions/mean_terminated_length": 870.081787109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8344787171701028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.9690972641212007, "kl": 0.04638671875, "learning_rate": 1.732895406656062e-07, "loss": 0.0788, "num_tokens": 2141074611.0, "reward": 2.318638563156128, "reward_std": 0.47845640778541565, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.17149938642978668, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 979.700927734375, "completions/mean_terminated_length": 733.1703491210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8346918118374088, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13941158294168482, "kl": 0.0284423828125, "learning_rate": 1.7310652883234584e-07, "loss": 0.0813, "num_tokens": 2141581901.0, "reward": 2.396205425262451, "reward_std": 0.388677179813385, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11015089601278305, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 962.8147583007812, "completions/mean_terminated_length": 719.685791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8349049065047147, "frac_reward_zero_std": 0.0, "grad_norm": 0.1373743387548222, "kl": 0.030120849609375, "learning_rate": 1.729237255807729e-07, "loss": 0.0897, "num_tokens": 2142078874.0, "reward": 2.4246652126312256, "reward_std": 0.4762446880340576, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.126290425658226, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1038.0535888671875, "completions/mean_terminated_length": 825.14599609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8351180011720206, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15000978176844298, "kl": 0.025421142578125, "learning_rate": 1.7274113101205523e-07, "loss": 0.0299, "num_tokens": 2142618370.0, "reward": 2.4190850257873535, "reward_std": 0.38210952281951904, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.09933306276798248, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 925.8326416015625, "completions/mean_terminated_length": 725.0237426757812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8353310958393266, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1339903724570275, "kl": 0.029205322265625, "learning_rate": 1.7255874522724494e-07, "loss": 0.0442, "num_tokens": 2143101927.0, "reward": 2.5066964626312256, "reward_std": 0.4270004332065582, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12717820703983307, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 992.63623046875, "completions/mean_terminated_length": 780.431640625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8355441905066325, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11098405181852714, "kl": 0.02679443359375, "learning_rate": 1.7237656832727826e-07, "loss": 0.0739, "num_tokens": 2143612724.0, "reward": 2.513951063156128, "reward_std": 0.36678698658943176, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10326440632343292, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 919.18310546875, "completions/mean_terminated_length": 747.9743041992188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8357572851739385, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14706129399313073, "kl": 0.030242919921875, "learning_rate": 1.7219460041297657e-07, "loss": 0.0939, "num_tokens": 2144090758.0, "reward": 2.4380581378936768, "reward_std": 0.4117111563682556, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.14050593972206116, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 904.6004638671875, "completions/mean_terminated_length": 751.1823120117188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8359703798412444, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1262382231061428, "kl": 0.029693603515625, "learning_rate": 1.7201284158504497e-07, "loss": 0.0713, "num_tokens": 2144572451.0, "reward": 2.4073662757873535, "reward_std": 0.40641263127326965, "rewards/accuracy_reward/mean": 0.5300925970077515, "rewards/accuracy_reward/std": 0.4996722638607025, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13295157253742218, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1057.93310546875, "completions/mean_terminated_length": 809.0335083007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8361834745085505, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12624698429513712, "kl": 0.025390625, "learning_rate": 1.7183129194407317e-07, "loss": 0.123, "num_tokens": 2145108485.0, "reward": 2.384486675262451, "reward_std": 0.40201154351234436, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791021347046, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14585080742835999, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1012.5535888671875, "completions/mean_terminated_length": 810.9866333007812, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8363965691758564, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11714841837828893, "kl": 0.024200439453125, "learning_rate": 1.716499515905349e-07, "loss": 0.0245, "num_tokens": 2145634333.0, "reward": 2.453125, "reward_std": 0.35915809869766235, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11771687865257263, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1052.7210693359375, "completions/mean_terminated_length": 836.3560180664062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8366096638431624, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1269775391191213, "kl": 0.02728271484375, "learning_rate": 1.7146882062478807e-07, "loss": 0.077, "num_tokens": 2146175296.0, "reward": 2.4598214626312256, "reward_std": 0.40136730670928955, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10517053306102753, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1059.265625, "completions/mean_terminated_length": 831.09619140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8368227585104683, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13080246427676054, "kl": 0.026092529296875, "learning_rate": 1.7128789914707504e-07, "loss": 0.0467, "num_tokens": 2146721095.0, "reward": 2.3604912757873535, "reward_std": 0.43019357323646545, "rewards/accuracy_reward/mean": 0.46990740299224854, "rewards/accuracy_reward/std": 0.4996722638607025, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1110.8125, "completions/mean_terminated_length": 771.8297729492188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8370358531777742, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11130727770213093, "kl": 0.024932861328125, "learning_rate": 1.7110718725752188e-07, "loss": 0.0549, "num_tokens": 2147287747.0, "reward": 2.380580425262451, "reward_std": 0.37004560232162476, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399910926818848, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1059.35498046875, "completions/mean_terminated_length": 807.3473510742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8372489478450802, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14371740086122145, "kl": 0.02691650390625, "learning_rate": 1.7092668505613883e-07, "loss": 0.099, "num_tokens": 2147837106.0, "reward": 2.364955425262451, "reward_std": 0.49356189370155334, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14114972949028015, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 993.9777221679688, "completions/mean_terminated_length": 771.7783813476562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8374620425123861, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13519628448879453, "kl": 0.028076171875, "learning_rate": 1.7074639264281998e-07, "loss": 0.0532, "num_tokens": 2148357624.0, "reward": 2.3560268878936768, "reward_std": 0.38538238406181335, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09605696052312851, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1015.138427734375, "completions/mean_terminated_length": 833.5065307617188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8376751371796921, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12088348123742187, "kl": 0.02813720703125, "learning_rate": 1.705663101173434e-07, "loss": 0.0501, "num_tokens": 2148880438.0, "reward": 2.5223214626312256, "reward_std": 0.4124482572078705, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14903545379638672, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1020.04248046875, "completions/mean_terminated_length": 793.1634521484375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.837888231846998, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.6012469312318556, "kl": 0.031951904296875, "learning_rate": 1.7038643757937106e-07, "loss": 0.0768, "num_tokens": 2149414249.0, "reward": 2.459263563156128, "reward_std": 0.4629363417625427, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16142748296260834, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1141.0982666015625, "completions/mean_terminated_length": 849.49853515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.838101326514304, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12076342193411635, "kl": 0.02386474609375, "learning_rate": 1.7020677512844843e-07, "loss": 0.0769, "num_tokens": 2150004821.0, "reward": 2.3080358505249023, "reward_std": 0.46129682660102844, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16740410029888153, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1060.01123046875, "completions/mean_terminated_length": 818.5028076171875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8383144211816099, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 1.2271355853532218, "kl": 0.029296875, "learning_rate": 1.7002732286400523e-07, "loss": 0.0694, "num_tokens": 2150554970.0, "reward": 2.341517925262451, "reward_std": 0.4885002374649048, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15456202626228333, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1033.759033203125, "completions/mean_terminated_length": 842.7479858398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8385275158489158, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10985055987365569, "kl": 0.025726318359375, "learning_rate": 1.6984808088535436e-07, "loss": 0.0465, "num_tokens": 2151088478.0, "reward": 2.4732143878936768, "reward_std": 0.36709195375442505, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.1042405441403389, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1047.24560546875, "completions/mean_terminated_length": 846.021484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8387406105162218, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12345973871857009, "kl": 0.02716064453125, "learning_rate": 1.6966904929169258e-07, "loss": 0.1071, "num_tokens": 2151627180.0, "reward": 2.4933037757873535, "reward_std": 0.45501869916915894, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13271193206310272, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1094.609375, "completions/mean_terminated_length": 806.375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8389537051835277, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1304823868169798, "kl": 0.026519775390625, "learning_rate": 1.6949022818210012e-07, "loss": 0.0603, "num_tokens": 2152197485.0, "reward": 2.34375, "reward_std": 0.43222954869270325, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 977.43310546875, "completions/mean_terminated_length": 741.1498413085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8391667998508338, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13443368921811552, "kl": 0.02630615234375, "learning_rate": 1.6931161765554076e-07, "loss": 0.087, "num_tokens": 2152699743.0, "reward": 2.3627233505249023, "reward_std": 0.48113736510276794, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15520282089710236, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1061.430908203125, "completions/mean_terminated_length": 827.052490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8393798945181397, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12110238324440441, "kl": 0.027862548828125, "learning_rate": 1.6913321781086202e-07, "loss": 0.0699, "num_tokens": 2153247088.0, "reward": 2.369419813156128, "reward_std": 0.41558340191841125, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14409084618091583, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1041.044677734375, "completions/mean_terminated_length": 773.6610107421875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8395929891854457, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11303236677351514, "kl": 0.026824951171875, "learning_rate": 1.6895502874679413e-07, "loss": 0.0191, "num_tokens": 2153784388.0, "reward": 2.4799108505249023, "reward_std": 0.3943043649196625, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1026.0335693359375, "completions/mean_terminated_length": 861.8834228515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8398060838527516, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11558990207418297, "kl": 0.027130126953125, "learning_rate": 1.6877705056195146e-07, "loss": 0.0434, "num_tokens": 2154315283.0, "reward": 2.498326063156128, "reward_std": 0.4462723135948181, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13440261781215668, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1085.747802734375, "completions/mean_terminated_length": 833.6647338867188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8400191785200576, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.11042895644084216, "kl": 0.022735595703125, "learning_rate": 1.6859928335483114e-07, "loss": 0.0695, "num_tokens": 2154875634.0, "reward": 2.4112725257873535, "reward_std": 0.3485161364078522, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.13213567435741425, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1058.0535888671875, "completions/mean_terminated_length": 871.6180419921875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8402322731873635, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1286800157820968, "kl": 0.025238037109375, "learning_rate": 1.6842172722381375e-07, "loss": 0.0796, "num_tokens": 2155421898.0, "reward": 2.474888563156128, "reward_std": 0.4429524838924408, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.150824636220932, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 970.8951416015625, "completions/mean_terminated_length": 781.48291015625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8404453678546694, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14490422154992252, "kl": 0.0281982421875, "learning_rate": 1.6824438226716305e-07, "loss": 0.1044, "num_tokens": 2155918459.0, "reward": 2.4642858505249023, "reward_std": 0.42229321599006653, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1041.5670166015625, "completions/mean_terminated_length": 855.1904296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8406584625219754, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12608539359190912, "kl": 0.029754638671875, "learning_rate": 1.6806724858302574e-07, "loss": 0.0596, "num_tokens": 2156453385.0, "reward": 2.4921875, "reward_std": 0.4592643678188324, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1319335699081421, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1111.3482666015625, "completions/mean_terminated_length": 835.2254028320312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8408715571892813, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11003676468301371, "kl": 0.023895263671875, "learning_rate": 1.6789032626943194e-07, "loss": 0.0975, "num_tokens": 2157024213.0, "reward": 2.396205425262451, "reward_std": 0.3622308075428009, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389531940221786, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1056.4085693359375, "completions/mean_terminated_length": 803.64990234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8410846518565873, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12024458695282089, "kl": 0.025421142578125, "learning_rate": 1.6771361542429463e-07, "loss": 0.0192, "num_tokens": 2157572252.0, "reward": 2.385044813156128, "reward_std": 0.3676413297653198, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.0999298095703125, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1025.8192138671875, "completions/mean_terminated_length": 747.0426635742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8412977465238932, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12444682506885027, "kl": 0.02740478515625, "learning_rate": 1.6753711614540961e-07, "loss": 0.0839, "num_tokens": 2158101803.0, "reward": 2.4838171005249023, "reward_std": 0.44563043117523193, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1002.6875610351562, "completions/mean_terminated_length": 778.8943481445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8415108411911992, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1477206465752429, "kl": 0.02728271484375, "learning_rate": 1.6736082853045576e-07, "loss": 0.1316, "num_tokens": 2158620015.0, "reward": 2.4715402126312256, "reward_std": 0.4965323507785797, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1546175479888916, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 936.66748046875, "completions/mean_terminated_length": 784.352783203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8417239358585051, "frac_reward_zero_std": 0.0, "grad_norm": 0.13059005084397984, "kl": 0.029632568359375, "learning_rate": 1.671847526769948e-07, "loss": 0.0349, "num_tokens": 2159102826.0, "reward": 2.486607313156128, "reward_std": 0.45923885703086853, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13940991461277008, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 992.4910888671875, "completions/mean_terminated_length": 730.8189086914062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.841937030525811, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12847389270482107, "kl": 0.028472900390625, "learning_rate": 1.670088886824712e-07, "loss": 0.1107, "num_tokens": 2159612214.0, "reward": 2.4620537757873535, "reward_std": 0.44468992948532104, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15850186347961426, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1041.493408203125, "completions/mean_terminated_length": 835.8629150390625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.842150125193117, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12682891652199707, "kl": 0.0263671875, "learning_rate": 1.6683323664421218e-07, "loss": 0.0748, "num_tokens": 2160149699.0, "reward": 2.3058037757873535, "reward_std": 0.45571643114089966, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.494759202003479, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14658613502979279, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1037.1763916015625, "completions/mean_terminated_length": 768.7655639648438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.842363219860423, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14278411919562792, "kl": 0.026275634765625, "learning_rate": 1.666577966594278e-07, "loss": 0.0742, "num_tokens": 2160684610.0, "reward": 2.2650671005249023, "reward_std": 0.44000667333602905, "rewards/accuracy_reward/mean": 0.40509259700775146, "rewards/accuracy_reward/std": 0.49147912859916687, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15792487561702728, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 977.9777221679688, "completions/mean_terminated_length": 793.104736328125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.842576314527729, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15017519648572, "kl": 0.027130126953125, "learning_rate": 1.6648256882521078e-07, "loss": 0.1066, "num_tokens": 2161190568.0, "reward": 2.424107313156128, "reward_std": 0.46449989080429077, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17606906592845917, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 934.1160888671875, "completions/mean_terminated_length": 741.6649169921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8427894091950349, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12163427570938457, "kl": 0.027862548828125, "learning_rate": 1.6630755323853597e-07, "loss": 0.0742, "num_tokens": 2161675580.0, "reward": 2.537388563156128, "reward_std": 0.4003835618495941, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15880775451660156, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 889.1607666015625, "completions/mean_terminated_length": 713.3984375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8430025038623409, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15176354224383629, "kl": 0.03155517578125, "learning_rate": 1.6613274999626134e-07, "loss": 0.1254, "num_tokens": 2162141652.0, "reward": 2.5184152126312256, "reward_std": 0.4287479519844055, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13335825502872467, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1153.19873046875, "completions/mean_terminated_length": 937.5540161132812, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8432155985296468, "frac_reward_zero_std": 0.0, "grad_norm": 0.11292106488318103, "kl": 0.024505615234375, "learning_rate": 1.65958159195127e-07, "loss": 0.0422, "num_tokens": 2162728445.0, "reward": 2.36328125, "reward_std": 0.4955526888370514, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14843007922172546, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 971.6428833007812, "completions/mean_terminated_length": 824.121826171875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8434286931969528, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14502986764941458, "kl": 0.03533935546875, "learning_rate": 1.6578378093175582e-07, "loss": 0.084, "num_tokens": 2163234909.0, "reward": 2.3800225257873535, "reward_std": 0.5102487206459045, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1810806840658188, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1077.015625, "completions/mean_terminated_length": 812.2017211914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8436417878642587, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11107100531797799, "kl": 0.025238037109375, "learning_rate": 1.6560961530265243e-07, "loss": 0.0661, "num_tokens": 2163795876.0, "reward": 2.33203125, "reward_std": 0.35405829548835754, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.11001905798912048, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1037.85498046875, "completions/mean_terminated_length": 758.697998046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8438548825315646, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14957969269666366, "kl": 0.0294189453125, "learning_rate": 1.6543566240420456e-07, "loss": 0.0955, "num_tokens": 2164333875.0, "reward": 2.322544813156128, "reward_std": 0.4018864929676056, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14311718940734863, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 903.1339721679688, "completions/mean_terminated_length": 756.0604248046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.8440679771988706, "frac_reward_zero_std": 0.0, "grad_norm": 0.1465855151067479, "kl": 0.029327392578125, "learning_rate": 1.652619223326816e-07, "loss": 0.0587, "num_tokens": 2164812671.0, "reward": 2.5675225257873535, "reward_std": 0.4471858739852905, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1119314432144165, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 963.76123046875, "completions/mean_terminated_length": 796.0953369140625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8442810718661765, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13034256464843505, "kl": 0.0286865234375, "learning_rate": 1.6508839518423547e-07, "loss": 0.0512, "num_tokens": 2165314804.0, "reward": 2.498326063156128, "reward_std": 0.3727579712867737, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523062229156, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1073.700927734375, "completions/mean_terminated_length": 832.1615600585938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8444941665334825, "frac_reward_zero_std": 0.0, "grad_norm": 0.13164835590084706, "kl": 0.0250244140625, "learning_rate": 1.649150810549001e-07, "loss": 0.0731, "num_tokens": 2165862606.0, "reward": 2.3504464626312256, "reward_std": 0.4473680555820465, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14088857173919678, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1078.216552734375, "completions/mean_terminated_length": 799.5430908203125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8447072612007884, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12167559614533628, "kl": 0.02386474609375, "learning_rate": 1.6474198004059153e-07, "loss": 0.055, "num_tokens": 2166418079.0, "reward": 2.3621652126312256, "reward_std": 0.3951001465320587, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.13750630617141724, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 984.2656860351562, "completions/mean_terminated_length": 790.604248046875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8449203558680944, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14158311362860126, "kl": 0.028594970703125, "learning_rate": 1.645690922371083e-07, "loss": 0.0742, "num_tokens": 2166930758.0, "reward": 2.404017925262451, "reward_std": 0.3562011122703552, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13165414333343506, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1012.966552734375, "completions/mean_terminated_length": 818.039794921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8451334505354003, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15481955372692918, "kl": 0.029296875, "learning_rate": 1.6439641774013013e-07, "loss": 0.0618, "num_tokens": 2167455095.0, "reward": 2.4291296005249023, "reward_std": 0.4108121693134308, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12956927716732025, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 989.8995971679688, "completions/mean_terminated_length": 780.5427856445312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8453465452027064, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12174940001540113, "kl": 0.027099609375, "learning_rate": 1.642239566452192e-07, "loss": 0.0512, "num_tokens": 2167971322.0, "reward": 2.4698662757873535, "reward_std": 0.4359026849269867, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10969661176204681, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1051.71875, "completions/mean_terminated_length": 825.1671752929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8455596398700123, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12803480056938524, "kl": 0.0255126953125, "learning_rate": 1.6405170904781977e-07, "loss": 0.0481, "num_tokens": 2168507404.0, "reward": 2.4799108505249023, "reward_std": 0.4365629255771637, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1003.6920166015625, "completions/mean_terminated_length": 807.0185546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8457727345373182, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1087610009346226, "kl": 0.02685546875, "learning_rate": 1.638796750432575e-07, "loss": 0.0582, "num_tokens": 2169026082.0, "reward": 2.3482143878936768, "reward_std": 0.37354791164398193, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.08746546506881714, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1151.779052734375, "completions/mean_terminated_length": 838.6415405273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8459858292046242, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13314493535206487, "kl": 0.023834228515625, "learning_rate": 1.6370785472674016e-07, "loss": 0.109, "num_tokens": 2169618431.0, "reward": 2.236607313156128, "reward_std": 0.47649946808815, "rewards/accuracy_reward/mean": 0.3504464328289032, "rewards/accuracy_reward/std": 0.47764310240745544, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1657252162694931, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1014.5781860351562, "completions/mean_terminated_length": 786.4931640625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8461989238719301, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1314455788394422, "kl": 0.0267333984375, "learning_rate": 1.6353624819335694e-07, "loss": 0.072, "num_tokens": 2170141506.0, "reward": 2.376674175262451, "reward_std": 0.47302737832069397, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1552460640668869, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 994.91748046875, "completions/mean_terminated_length": 741.1273803710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8464120185392361, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13290918244610742, "kl": 0.0291748046875, "learning_rate": 1.6336485553807917e-07, "loss": 0.0585, "num_tokens": 2170656173.0, "reward": 2.3677456378936768, "reward_std": 0.4094289541244507, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15122967958450317, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1031.325927734375, "completions/mean_terminated_length": 813.6640014648438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.846625113206542, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1298284687566422, "kl": 0.02508544921875, "learning_rate": 1.6319367685575957e-07, "loss": 0.0846, "num_tokens": 2171193391.0, "reward": 2.3705358505249023, "reward_std": 0.454289048910141, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 918.7522583007812, "completions/mean_terminated_length": 716.6763305664062, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.846838207873848, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12502940832595105, "kl": 0.02972412109375, "learning_rate": 1.6302271224113213e-07, "loss": 0.0745, "num_tokens": 2171674480.0, "reward": 2.5167412757873535, "reward_std": 0.3748437762260437, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765028238297, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 898.825927734375, "completions/mean_terminated_length": 717.68994140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8470513025411539, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1288293885664111, "kl": 0.0308837890625, "learning_rate": 1.6285196178881296e-07, "loss": 0.0332, "num_tokens": 2172141154.0, "reward": 2.536830425262451, "reward_std": 0.3844040632247925, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13170628249645233, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1036.421875, "completions/mean_terminated_length": 823.1702880859375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8472643972084598, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1258931991499077, "kl": 0.0252685546875, "learning_rate": 1.6268142559329934e-07, "loss": 0.0943, "num_tokens": 2172682111.0, "reward": 2.4720983505249023, "reward_std": 0.40248000621795654, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10269007831811905, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1012.997802734375, "completions/mean_terminated_length": 843.6337280273438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8474774918757658, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12520259227471073, "kl": 0.02728271484375, "learning_rate": 1.6251110374896993e-07, "loss": 0.0749, "num_tokens": 2173215390.0, "reward": 2.5574777126312256, "reward_std": 0.44498884677886963, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14611589908599854, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1071.118408203125, "completions/mean_terminated_length": 835.6925048828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8476905865430717, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13233741484767567, "kl": 0.02618408203125, "learning_rate": 1.623409963500848e-07, "loss": 0.0938, "num_tokens": 2173757235.0, "reward": 2.390625, "reward_std": 0.444413423538208, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16025643050670624, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 949.2344360351562, "completions/mean_terminated_length": 752.6132202148438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8479036812103777, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1385214731696971, "kl": 0.028289794921875, "learning_rate": 1.6217110349078535e-07, "loss": 0.0657, "num_tokens": 2174268956.0, "reward": 2.388951063156128, "reward_std": 0.46755221486091614, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14494065940380096, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1034.462158203125, "completions/mean_terminated_length": 797.1322631835938, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.8481167758776836, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12131822339574642, "kl": 0.025238037109375, "learning_rate": 1.6200142526509445e-07, "loss": 0.0844, "num_tokens": 2174805307.0, "reward": 2.4112725257873535, "reward_std": 0.43319904804229736, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151519536972046, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 998.62060546875, "completions/mean_terminated_length": 833.2144775390625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8483298705449897, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12282210959777312, "kl": 0.029052734375, "learning_rate": 1.6183196176691595e-07, "loss": 0.0439, "num_tokens": 2175321521.0, "reward": 2.4190850257873535, "reward_std": 0.39597976207733154, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11540208011865616, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1024.310302734375, "completions/mean_terminated_length": 781.11328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8485429652122956, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1482734076998015, "kl": 0.028717041015625, "learning_rate": 1.616627130900348e-07, "loss": 0.1002, "num_tokens": 2175852620.0, "reward": 2.39453125, "reward_std": 0.5537673830986023, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9503348469734192, "rewards/tag_count_reward/std": 0.1743151992559433, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1001.6652221679688, "completions/mean_terminated_length": 774.2011108398438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8487560598796016, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12847984390054223, "kl": 0.026519775390625, "learning_rate": 1.6149367932811725e-07, "loss": 0.0474, "num_tokens": 2176366806.0, "reward": 2.3956475257873535, "reward_std": 0.409063458442688, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1310732513666153, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1042.4285888671875, "completions/mean_terminated_length": 813.764404296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8489691545469075, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1126761313172664, "kl": 0.02685546875, "learning_rate": 1.613248605747107e-07, "loss": 0.071, "num_tokens": 2176901846.0, "reward": 2.3900671005249023, "reward_std": 0.4422467350959778, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15614411234855652, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1043.0960693359375, "completions/mean_terminated_length": 821.30517578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8491822492142134, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1269478890579014, "kl": 0.027801513671875, "learning_rate": 1.611562569232432e-07, "loss": 0.0491, "num_tokens": 2177444225.0, "reward": 2.3900671005249023, "reward_std": 0.5063492655754089, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16087746620178223, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 989.0223388671875, "completions/mean_terminated_length": 748.2137451171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8493953438815194, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13698895792540383, "kl": 0.027130126953125, "learning_rate": 1.6098786846702393e-07, "loss": 0.0888, "num_tokens": 2177966763.0, "reward": 2.4090402126312256, "reward_std": 0.3535729944705963, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801211535930634, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1001.3795166015625, "completions/mean_terminated_length": 784.1563110351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8496084385488253, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.2423596646678693, "kl": 0.02880859375, "learning_rate": 1.6081969529924325e-07, "loss": 0.0538, "num_tokens": 2178488469.0, "reward": 2.447544813156128, "reward_std": 0.4372623562812805, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.1088741272687912, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1128.3660888671875, "completions/mean_terminated_length": 864.1034545898438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8498215332161313, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13649500116038935, "kl": 0.02642822265625, "learning_rate": 1.6065173751297202e-07, "loss": 0.1004, "num_tokens": 2179063609.0, "reward": 2.3359375, "reward_std": 0.5194458365440369, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16402535140514374, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1166.9085693359375, "completions/mean_terminated_length": 866.1766967773438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8500346278834372, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10804173824131713, "kl": 0.023468017578125, "learning_rate": 1.60483995201162e-07, "loss": 0.0284, "num_tokens": 2179664832.0, "reward": 2.2622768878936768, "reward_std": 0.4234698414802551, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14876297116279602, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 968.19873046875, "completions/mean_terminated_length": 737.0216674804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8502477225507432, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11653191083215435, "kl": 0.026947021484375, "learning_rate": 1.6031646845664567e-07, "loss": 0.008, "num_tokens": 2180167145.0, "reward": 2.4776787757873535, "reward_std": 0.37293896079063416, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12935835123062134, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1053.680908203125, "completions/mean_terminated_length": 834.2261352539062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8504608172180491, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10996306012586778, "kl": 0.027618408203125, "learning_rate": 1.601491573721363e-07, "loss": 0.0491, "num_tokens": 2180703178.0, "reward": 2.421875, "reward_std": 0.4754781723022461, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.1700088381767273, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 971.8839721679688, "completions/mean_terminated_length": 802.2636108398438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.850673911885355, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13696045285343755, "kl": 0.028961181640625, "learning_rate": 1.5998206204022792e-07, "loss": 0.0593, "num_tokens": 2181209350.0, "reward": 2.439174175262451, "reward_std": 0.45443442463874817, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1516006886959076, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 978.77685546875, "completions/mean_terminated_length": 829.1399536132812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.850887006552661, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11563832390311514, "kl": 0.02716064453125, "learning_rate": 1.5981518255339469e-07, "loss": 0.0833, "num_tokens": 2181717890.0, "reward": 2.4933037757873535, "reward_std": 0.39904528856277466, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12425874918699265, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1083.84375, "completions/mean_terminated_length": 854.7901000976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8511001012199669, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14586532128799606, "kl": 0.0250244140625, "learning_rate": 1.596485190039919e-07, "loss": 0.1078, "num_tokens": 2182269388.0, "reward": 2.361049175262451, "reward_std": 0.5348119139671326, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9436383843421936, "rewards/tag_count_reward/std": 0.19221055507659912, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1034.0826416015625, "completions/mean_terminated_length": 810.3024291992188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.8513131958872729, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13827242714387283, "kl": 0.025970458984375, "learning_rate": 1.5948207148425503e-07, "loss": 0.1132, "num_tokens": 2182804017.0, "reward": 2.4146206378936768, "reward_std": 0.46741876006126404, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13776934146881104, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1035.696533203125, "completions/mean_terminated_length": 815.6304321289062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8515262905545788, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12302089425598965, "kl": 0.02520751953125, "learning_rate": 1.5931584008629998e-07, "loss": 0.027, "num_tokens": 2183336617.0, "reward": 2.4765625, "reward_std": 0.41702374815940857, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09310024231672287, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1064.1898193359375, "completions/mean_terminated_length": 816.8630981445312, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.8517393852218849, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12656745337052655, "kl": 0.026397705078125, "learning_rate": 1.591498249021231e-07, "loss": 0.0332, "num_tokens": 2183880894.0, "reward": 2.5083706378936768, "reward_std": 0.39086827635765076, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10833819210529327, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1139.8616943359375, "completions/mean_terminated_length": 847.8643188476562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8519524798891908, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12295284507759542, "kl": 0.02325439453125, "learning_rate": 1.5898402602360102e-07, "loss": 0.1209, "num_tokens": 2184465648.0, "reward": 2.373326063156128, "reward_std": 0.474073588848114, "rewards/accuracy_reward/mean": 0.5092592835426331, "rewards/accuracy_reward/std": 0.5004938840866089, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16232210397720337, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1128.9754638671875, "completions/mean_terminated_length": 888.2168579101562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8521655745564968, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12440389870639547, "kl": 0.0252685546875, "learning_rate": 1.588184435424909e-07, "loss": 0.088, "num_tokens": 2185045733.0, "reward": 2.3448662757873535, "reward_std": 0.4449191391468048, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1576608121395111, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 985.1027221679688, "completions/mean_terminated_length": 781.5691528320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8523786692238027, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12281072038839168, "kl": 0.028350830078125, "learning_rate": 1.5865307755042988e-07, "loss": 0.0984, "num_tokens": 2185557763.0, "reward": 2.513951063156128, "reward_std": 0.4216967523097992, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 960.8839721679688, "completions/mean_terminated_length": 802.4041137695312, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.8525917638911086, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14072339744804316, "kl": 0.0291748046875, "learning_rate": 1.584879281389354e-07, "loss": 0.0692, "num_tokens": 2186057263.0, "reward": 2.4408483505249023, "reward_std": 0.4403288960456848, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15520283579826355, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 981.5670166015625, "completions/mean_terminated_length": 749.7337036132812, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8528048585584146, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11975955323051019, "kl": 0.027374267578125, "learning_rate": 1.5832299539940503e-07, "loss": 0.0624, "num_tokens": 2186566509.0, "reward": 2.4564733505249023, "reward_std": 0.41955676674842834, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13246241211891174, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 995.9464721679688, "completions/mean_terminated_length": 823.7921752929688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8530179532257205, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1453832982563269, "kl": 0.027618408203125, "learning_rate": 1.5815827942311634e-07, "loss": 0.0918, "num_tokens": 2187078837.0, "reward": 2.404017925262451, "reward_std": 0.5145753622055054, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16402915120124817, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 922.7567138671875, "completions/mean_terminated_length": 742.0181274414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8532310478930265, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14153947804040695, "kl": 0.02789306640625, "learning_rate": 1.5799378030122707e-07, "loss": 0.0641, "num_tokens": 2187559912.0, "reward": 2.4693081378936768, "reward_std": 0.3319628834724426, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11234336346387863, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 950.4129638671875, "completions/mean_terminated_length": 790.4066162109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8534441425603324, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11939264631217629, "kl": 0.028778076171875, "learning_rate": 1.578294981247748e-07, "loss": 0.0253, "num_tokens": 2188054481.0, "reward": 2.4732143878936768, "reward_std": 0.3730962872505188, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1061.82373046875, "completions/mean_terminated_length": 796.4220581054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8536572372276384, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10839936425407633, "kl": 0.024200439453125, "learning_rate": 1.5766543298467732e-07, "loss": 0.0783, "num_tokens": 2188595090.0, "reward": 2.4441964626312256, "reward_std": 0.4165492653846741, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11771687865257263, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1232.796875, "completions/mean_terminated_length": 957.8179321289062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8538703318949443, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10946990379207705, "kl": 0.0228271484375, "learning_rate": 1.575015849717321e-07, "loss": 0.0377, "num_tokens": 2189216151.0, "reward": 2.2901787757873535, "reward_std": 0.4693734049797058, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 987.2344360351562, "completions/mean_terminated_length": 797.4132080078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8540834265622502, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.4228329953421936, "kl": 0.02996826171875, "learning_rate": 1.5733795417661624e-07, "loss": 0.0752, "num_tokens": 2189734496.0, "reward": 2.446986675262451, "reward_std": 0.3441595435142517, "rewards/accuracy_reward/mean": 0.5578703880310059, "rewards/accuracy_reward/std": 0.49721553921699524, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1199323758482933, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1041.602783203125, "completions/mean_terminated_length": 832.7277221679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8542965212295562, "frac_reward_zero_std": 0.0, "grad_norm": 0.14216195771208615, "kl": 0.027801513671875, "learning_rate": 1.5717454068988716e-07, "loss": 0.121, "num_tokens": 2190267630.0, "reward": 2.4464287757873535, "reward_std": 0.5481163859367371, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134778022766, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.16626667976379395, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 999.0000610351562, "completions/mean_terminated_length": 788.0750732421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8545096158968621, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13551341729820773, "kl": 0.0283203125, "learning_rate": 1.5701134460198145e-07, "loss": 0.0712, "num_tokens": 2190789582.0, "reward": 2.423549175262451, "reward_std": 0.4042442739009857, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1330958753824234, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1047.200927734375, "completions/mean_terminated_length": 839.4878540039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8547227105641682, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14192018750225546, "kl": 0.026397705078125, "learning_rate": 1.5684836600321595e-07, "loss": 0.1099, "num_tokens": 2191327576.0, "reward": 2.3895089626312256, "reward_std": 0.46169066429138184, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14629201591014862, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1074.7054443359375, "completions/mean_terminated_length": 833.4150390625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.854935805231474, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14206528118573122, "kl": 0.02886962890625, "learning_rate": 1.5668560498378652e-07, "loss": 0.0556, "num_tokens": 2191879956.0, "reward": 2.275669813156128, "reward_std": 0.3963678479194641, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1464626044034958, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 985.8995971679688, "completions/mean_terminated_length": 755.0081787109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8551488998987801, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1391856678359491, "kl": 0.028564453125, "learning_rate": 1.5652306163376918e-07, "loss": 0.053, "num_tokens": 2192390631.0, "reward": 2.4441964626312256, "reward_std": 0.4495397210121155, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13636787235736847, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 923.2701416015625, "completions/mean_terminated_length": 785.1453857421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.855361994566086, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13136577836924435, "kl": 0.03045654296875, "learning_rate": 1.5636073604311911e-07, "loss": 0.0627, "num_tokens": 2192871920.0, "reward": 2.4659600257873535, "reward_std": 0.39481469988822937, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10073082149028778, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1071.0670166015625, "completions/mean_terminated_length": 808.1529541015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.855575089233392, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.127695395459119, "kl": 0.026336669921875, "learning_rate": 1.5619862830167116e-07, "loss": 0.1324, "num_tokens": 2193428942.0, "reward": 2.3543527126312256, "reward_std": 0.4791010320186615, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9458705186843872, "rewards/tag_count_reward/std": 0.17854657769203186, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1148.6138916015625, "completions/mean_terminated_length": 883.4768676757812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8557881839006979, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.33400718996469586, "kl": 0.027435302734375, "learning_rate": 1.5603673849913945e-07, "loss": 0.0968, "num_tokens": 2194019073.0, "reward": 2.28125, "reward_std": 0.4129432439804077, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16370916366577148, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 977.9152221679688, "completions/mean_terminated_length": 779.7512817382812, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.8560012785680038, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13713082830861867, "kl": 0.0303955078125, "learning_rate": 1.5587506672511763e-07, "loss": 0.0679, "num_tokens": 2194528011.0, "reward": 2.490513563156128, "reward_std": 0.448030561208725, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1121.134033203125, "completions/mean_terminated_length": 854.7930908203125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8562143732353098, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1299457331066092, "kl": 0.024169921875, "learning_rate": 1.5571361306907883e-07, "loss": 0.0621, "num_tokens": 2195104647.0, "reward": 2.3002233505249023, "reward_std": 0.42757025361061096, "rewards/accuracy_reward/mean": 0.39814814925193787, "rewards/accuracy_reward/std": 0.49008384346961975, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12177752703428268, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 954.7410888671875, "completions/mean_terminated_length": 772.53125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8564274679026157, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13356461116682083, "kl": 0.030059814453125, "learning_rate": 1.555523776203751e-07, "loss": 0.0637, "num_tokens": 2195598595.0, "reward": 2.474888563156128, "reward_std": 0.4388851523399353, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416090607643127, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1065.2210693359375, "completions/mean_terminated_length": 797.1903686523438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8566405625699217, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11481451753985222, "kl": 0.02593994140625, "learning_rate": 1.5539136046823783e-07, "loss": 0.0956, "num_tokens": 2196149654.0, "reward": 2.365513563156128, "reward_std": 0.47923123836517334, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.18044531345367432, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1098.0045166015625, "completions/mean_terminated_length": 881.9780883789062, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.8568536572372276, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12223149494792568, "kl": 0.0257568359375, "learning_rate": 1.5523056170177796e-07, "loss": 0.0706, "num_tokens": 2196714216.0, "reward": 2.4458706378936768, "reward_std": 0.40560591220855713, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12315750867128372, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 981.8281860351562, "completions/mean_terminated_length": 739.3836059570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8570667519045336, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12752028982107383, "kl": 0.02850341796875, "learning_rate": 1.5506998140998516e-07, "loss": 0.0994, "num_tokens": 2197222251.0, "reward": 2.41796875, "reward_std": 0.37595540285110474, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 945.5313110351562, "completions/mean_terminated_length": 758.42822265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8572798465718395, "frac_reward_zero_std": 0.25, "grad_norm": 0.13000654982989374, "kl": 0.026336669921875, "learning_rate": 1.5490961968172827e-07, "loss": 0.0437, "num_tokens": 2197715737.0, "reward": 2.3995537757873535, "reward_std": 0.340129554271698, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13318143784999847, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 917.3772583007812, "completions/mean_terminated_length": 728.9401245117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8574929412391455, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11967357958522949, "kl": 0.030059814453125, "learning_rate": 1.5474947660575528e-07, "loss": 0.0346, "num_tokens": 2198205186.0, "reward": 2.486049175262451, "reward_std": 0.356396347284317, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12130890041589737, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 942.1652221679688, "completions/mean_terminated_length": 757.859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8577060359064514, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14682389463246903, "kl": 0.029754638671875, "learning_rate": 1.545895522706932e-07, "loss": 0.0814, "num_tokens": 2198700252.0, "reward": 2.4698662757873535, "reward_std": 0.4171362817287445, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13763903081417084, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1107.4888916015625, "completions/mean_terminated_length": 850.98583984375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8579191305737573, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.45531164519502776, "kl": 0.0255126953125, "learning_rate": 1.5442984676504795e-07, "loss": 0.1193, "num_tokens": 2199280679.0, "reward": 2.3565850257873535, "reward_std": 0.43994763493537903, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1008.13623046875, "completions/mean_terminated_length": 799.0482788085938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8581322252410634, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11304294686787422, "kl": 0.026153564453125, "learning_rate": 1.5427036017720398e-07, "loss": 0.0506, "num_tokens": 2199799348.0, "reward": 2.4972100257873535, "reward_std": 0.39899393916130066, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 950.26123046875, "completions/mean_terminated_length": 763.9608764648438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8583453199083693, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12212884101555303, "kl": 0.02569580078125, "learning_rate": 1.5411109259542526e-07, "loss": 0.0683, "num_tokens": 2200299513.0, "reward": 2.489955425262451, "reward_std": 0.4126487672328949, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1302192062139511, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1007.4531860351562, "completions/mean_terminated_length": 738.5477905273438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8585584145756753, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1318979964231388, "kl": 0.025848388671875, "learning_rate": 1.5395204410785395e-07, "loss": 0.0859, "num_tokens": 2200834964.0, "reward": 2.4308037757873535, "reward_std": 0.3784793019294739, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13629460334777832, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 956.7567138671875, "completions/mean_terminated_length": 764.8582763671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8587715092429812, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1293461130825448, "kl": 0.02734375, "learning_rate": 1.5379321480251156e-07, "loss": 0.0693, "num_tokens": 2201339367.0, "reward": 2.364955425262451, "reward_std": 0.33711546659469604, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 973.1785888671875, "completions/mean_terminated_length": 780.8421630859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8589846039102872, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14945580670777017, "kl": 0.030029296875, "learning_rate": 1.5363460476729764e-07, "loss": 0.0928, "num_tokens": 2201844359.0, "reward": 2.505580425262451, "reward_std": 0.4236098825931549, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12247265130281448, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 998.0982666015625, "completions/mean_terminated_length": 776.767578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8591976985775931, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1279465561155509, "kl": 0.0279541015625, "learning_rate": 1.534762140899907e-07, "loss": 0.0639, "num_tokens": 2202359331.0, "reward": 2.4308037757873535, "reward_std": 0.431491881608963, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 961.154052734375, "completions/mean_terminated_length": 759.88623046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.859410793244899, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1443480419386777, "kl": 0.030059814453125, "learning_rate": 1.5331804285824802e-07, "loss": 0.0629, "num_tokens": 2202854872.0, "reward": 2.4135046005249023, "reward_std": 0.42508289217948914, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1328611671924591, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 892.2589721679688, "completions/mean_terminated_length": 730.5139770507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.859623887912205, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12953588915893907, "kl": 0.03033447265625, "learning_rate": 1.5316009115960522e-07, "loss": 0.0586, "num_tokens": 2203327676.0, "reward": 2.4520089626312256, "reward_std": 0.4128343164920807, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14360485970973969, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 989.91748046875, "completions/mean_terminated_length": 816.776611328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8598369825795109, "frac_reward_zero_std": 0.0, "grad_norm": 0.1388375747613047, "kl": 0.029083251953125, "learning_rate": 1.5300235908147646e-07, "loss": 0.0611, "num_tokens": 2203844151.0, "reward": 2.458705425262451, "reward_std": 0.5002565383911133, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16460275650024414, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 970.0178833007812, "completions/mean_terminated_length": 773.7625732421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8600500772468169, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1397498921715073, "kl": 0.026153564453125, "learning_rate": 1.5284484671115426e-07, "loss": 0.0779, "num_tokens": 2204349807.0, "reward": 2.3431921005249023, "reward_std": 0.3593369722366333, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10744724422693253, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1001.029052734375, "completions/mean_terminated_length": 807.1455078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8602631719141228, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.6891131788309965, "kl": 0.034027099609375, "learning_rate": 1.5268755413580997e-07, "loss": 0.0802, "num_tokens": 2204867116.0, "reward": 2.505580425262451, "reward_std": 0.4100479483604431, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1236090287566185, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1031.794677734375, "completions/mean_terminated_length": 786.8919677734375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8604762665814288, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12196421656826449, "kl": 0.02734375, "learning_rate": 1.525304814424927e-07, "loss": 0.0866, "num_tokens": 2205397152.0, "reward": 2.3872768878936768, "reward_std": 0.43952134251594543, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.19385726749897003, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1068.8795166015625, "completions/mean_terminated_length": 849.5136108398438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8606893612487347, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1257717609565754, "kl": 0.0247802734375, "learning_rate": 1.523736287181302e-07, "loss": 0.0791, "num_tokens": 2205954490.0, "reward": 2.3231027126312256, "reward_std": 0.37952110171318054, "rewards/accuracy_reward/mean": 0.4236111044883728, "rewards/accuracy_reward/std": 0.4947032034397125, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1149359717965126, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 916.2500610351562, "completions/mean_terminated_length": 731.0545043945312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8609024559160408, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1383920336400597, "kl": 0.030181884765625, "learning_rate": 1.5221699604952856e-07, "loss": 0.1064, "num_tokens": 2206430010.0, "reward": 2.4308037757873535, "reward_std": 0.4416918158531189, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14480386674404144, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 911.74560546875, "completions/mean_terminated_length": 756.0151977539062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8611155505833467, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12813956034508955, "kl": 0.028961181640625, "learning_rate": 1.520605835233719e-07, "loss": 0.046, "num_tokens": 2206903432.0, "reward": 2.4988839626312256, "reward_std": 0.3404530882835388, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09750169515609741, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 984.029052734375, "completions/mean_terminated_length": 773.5106811523438, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.8613286452506526, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14022510955042372, "kl": 0.0284423828125, "learning_rate": 1.5190439122622257e-07, "loss": 0.0865, "num_tokens": 2207416837.0, "reward": 2.4168527126312256, "reward_std": 0.4671056568622589, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.17125900089740753, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 985.716552734375, "completions/mean_terminated_length": 802.1806640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8615417399179586, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11890608651185142, "kl": 0.027618408203125, "learning_rate": 1.5174841924452115e-07, "loss": 0.0549, "num_tokens": 2207922374.0, "reward": 2.5362725257873535, "reward_std": 0.3677121102809906, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11109180748462677, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1043.609375, "completions/mean_terminated_length": 818.5819702148438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8617548345852645, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1353989344355892, "kl": 0.02630615234375, "learning_rate": 1.5159266766458598e-07, "loss": 0.0791, "num_tokens": 2208453927.0, "reward": 2.3861608505249023, "reward_std": 0.4289137125015259, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1457662582397461, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 984.763427734375, "completions/mean_terminated_length": 770.9758911132812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8619679292525705, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13333855636475003, "kl": 0.026214599609375, "learning_rate": 1.5143713657261396e-07, "loss": 0.1046, "num_tokens": 2208973277.0, "reward": 2.4921875, "reward_std": 0.4110495150089264, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559208810329437, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1120.899658203125, "completions/mean_terminated_length": 871.3966064453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8621810239198764, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1127224049547244, "kl": 0.024688720703125, "learning_rate": 1.5128182605467928e-07, "loss": 0.0746, "num_tokens": 2209552000.0, "reward": 2.4112725257873535, "reward_std": 0.4183811545372009, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 882.4152221679688, "completions/mean_terminated_length": 712.4961547851562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8623941185871824, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1220537501274085, "kl": 0.03216552734375, "learning_rate": 1.511267361967347e-07, "loss": 0.0441, "num_tokens": 2210016330.0, "reward": 2.5005581378936768, "reward_std": 0.4250258505344391, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1139.107177734375, "completions/mean_terminated_length": 857.4035034179688, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.8626072132544883, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12305608743846658, "kl": 0.024627685546875, "learning_rate": 1.5097186708461047e-07, "loss": 0.0721, "num_tokens": 2210598410.0, "reward": 2.4252233505249023, "reward_std": 0.44480597972869873, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14835961163043976, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 935.1451416015625, "completions/mean_terminated_length": 725.5623168945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8628203079217942, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13155395863622382, "kl": 0.0291748046875, "learning_rate": 1.5081721880401483e-07, "loss": 0.0701, "num_tokens": 2211091259.0, "reward": 2.411830425262451, "reward_std": 0.3875403106212616, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12805373966693878, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 895.747802734375, "completions/mean_terminated_length": 710.6709594726562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8630334025891002, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14001871328866852, "kl": 0.031829833984375, "learning_rate": 1.5066279144053372e-07, "loss": 0.1001, "num_tokens": 2211562970.0, "reward": 2.431919813156128, "reward_std": 0.4137674868106842, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13559210300445557, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 909.5982666015625, "completions/mean_terminated_length": 750.2799072265625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8632464972564061, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1543959946973897, "kl": 0.0308837890625, "learning_rate": 1.505085850796308e-07, "loss": 0.0609, "num_tokens": 2212036614.0, "reward": 2.5345983505249023, "reward_std": 0.37925994396209717, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 943.8951416015625, "completions/mean_terminated_length": 779.6948852539062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8634595919237121, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13608682055528662, "kl": 0.03076171875, "learning_rate": 1.503545998066477e-07, "loss": 0.0756, "num_tokens": 2212527511.0, "reward": 2.553013563156128, "reward_std": 0.42629748582839966, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13878051936626434, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 912.0625610351562, "completions/mean_terminated_length": 729.606201171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.863672686591018, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14699163514256142, "kl": 0.028564453125, "learning_rate": 1.5020083570680333e-07, "loss": 0.0792, "num_tokens": 2212998979.0, "reward": 2.4771206378936768, "reward_std": 0.4592069685459137, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14177079498767853, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1068.8482666015625, "completions/mean_terminated_length": 842.89013671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.863885781258324, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12627791405463323, "kl": 0.025146484375, "learning_rate": 1.5004729286519444e-07, "loss": 0.0932, "num_tokens": 2213546815.0, "reward": 2.349330425262451, "reward_std": 0.47795534133911133, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16375111043453217, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1068.921875, "completions/mean_terminated_length": 852.8310546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.86409887592563, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12607570905945265, "kl": 0.026458740234375, "learning_rate": 1.4989397136679513e-07, "loss": 0.0806, "num_tokens": 2214106588.0, "reward": 2.3191964626312256, "reward_std": 0.4842020571231842, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1554640233516693, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 1020.15185546875, "completions/mean_terminated_length": 765.3370361328125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.864311970592936, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1148854580066887, "kl": 0.024993896484375, "learning_rate": 1.4974087129645716e-07, "loss": 0.0745, "num_tokens": 2214635120.0, "reward": 2.4112725257873535, "reward_std": 0.3464162051677704, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493200302124, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523062229156, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 944.8281860351562, "completions/mean_terminated_length": 747.41845703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8645250652602419, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13450169531886802, "kl": 0.028411865234375, "learning_rate": 1.495879927389097e-07, "loss": 0.0683, "num_tokens": 2215123251.0, "reward": 2.58203125, "reward_std": 0.3914824426174164, "rewards/accuracy_reward/mean": 0.6696428656578064, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1048.919677734375, "completions/mean_terminated_length": 797.754150390625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8647381599275478, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12004838276080283, "kl": 0.024200439453125, "learning_rate": 1.4943533577875927e-07, "loss": 0.0733, "num_tokens": 2215665343.0, "reward": 2.377232313156128, "reward_std": 0.4044736325740814, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.14033812284469604, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 928.3482666015625, "completions/mean_terminated_length": 727.989501953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8649512545948538, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12773242726153758, "kl": 0.028564453125, "learning_rate": 1.4928290050048994e-07, "loss": 0.0741, "num_tokens": 2216149323.0, "reward": 2.4268975257873535, "reward_std": 0.4526580572128296, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517839670181274, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 977.122802734375, "completions/mean_terminated_length": 792.1021118164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8651643492621597, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1489828260238368, "kl": 0.02642822265625, "learning_rate": 1.4913068698846287e-07, "loss": 0.0681, "num_tokens": 2216659266.0, "reward": 2.2472100257873535, "reward_std": 0.38584235310554504, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1058.872802734375, "completions/mean_terminated_length": 810.20947265625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8653774439294657, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11983759944217881, "kl": 0.026092529296875, "learning_rate": 1.4897869532691669e-07, "loss": 0.0421, "num_tokens": 2217200825.0, "reward": 2.4832589626312256, "reward_std": 0.4228314757347107, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 951.15185546875, "completions/mean_terminated_length": 737.6320190429688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8655905385967716, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12969772532712365, "kl": 0.03009033203125, "learning_rate": 1.4882692559996705e-07, "loss": 0.0501, "num_tokens": 2217689229.0, "reward": 2.4056921005249023, "reward_std": 0.39541152119636536, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13254131376743317, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 976.779052734375, "completions/mean_terminated_length": 768.2479858398438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8658036332640776, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.137910207218541, "kl": 0.027587890625, "learning_rate": 1.4867537789160683e-07, "loss": 0.1017, "num_tokens": 2218192426.0, "reward": 2.4129464626312256, "reward_std": 0.4871104657649994, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 962.6875610351562, "completions/mean_terminated_length": 768.4736938476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8660167279313835, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15066555273569057, "kl": 0.029510498046875, "learning_rate": 1.4852405228570635e-07, "loss": 0.1312, "num_tokens": 2218688814.0, "reward": 2.4140625, "reward_std": 0.4451924264431, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13455694913864136, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 927.9553833007812, "completions/mean_terminated_length": 741.28125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8662298225986894, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1420741969141825, "kl": 0.02874755859375, "learning_rate": 1.4837294886601236e-07, "loss": 0.0326, "num_tokens": 2219174778.0, "reward": 2.5200893878936768, "reward_std": 0.39433616399765015, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09872953593730927, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1126.0804443359375, "completions/mean_terminated_length": 826.04736328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8664429172659954, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11289294799220716, "kl": 0.0234375, "learning_rate": 1.4822206771614936e-07, "loss": 0.0444, "num_tokens": 2219752942.0, "reward": 2.3683037757873535, "reward_std": 0.4109860360622406, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 923.0469360351562, "completions/mean_terminated_length": 749.0850219726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8666560119333013, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1296488949600664, "kl": 0.03076171875, "learning_rate": 1.4807140891961838e-07, "loss": 0.0631, "num_tokens": 2220234931.0, "reward": 2.35546875, "reward_std": 0.4363895654678345, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.1657317876815796, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1020.357177734375, "completions/mean_terminated_length": 779.7245483398438, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.8668691066006073, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.12993528168241564, "kl": 0.024444580078125, "learning_rate": 1.4792097255979759e-07, "loss": 0.0922, "num_tokens": 2220753747.0, "reward": 2.453125, "reward_std": 0.3721867799758911, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 949.7232666015625, "completions/mean_terminated_length": 770.0051879882812, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8670822012679132, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15242613932852447, "kl": 0.0289306640625, "learning_rate": 1.4777075871994193e-07, "loss": 0.1055, "num_tokens": 2221248423.0, "reward": 2.4380581378936768, "reward_std": 0.501368522644043, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16202186048030853, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 988.4375610351562, "completions/mean_terminated_length": 771.9677124023438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8672952959352193, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12854758094670285, "kl": 0.027618408203125, "learning_rate": 1.4762076748318317e-07, "loss": 0.0671, "num_tokens": 2221761339.0, "reward": 2.48046875, "reward_std": 0.4184191823005676, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.12858274579048157, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 920.0469360351562, "completions/mean_terminated_length": 765.4542846679688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8675083906025252, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1466931195253997, "kl": 0.029052734375, "learning_rate": 1.4747099893253029e-07, "loss": 0.1236, "num_tokens": 2222251744.0, "reward": 2.5390625, "reward_std": 0.4405645728111267, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.48100292682647705, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12425371259450912, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1035.5848388671875, "completions/mean_terminated_length": 844.9177856445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8677214852698312, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 6132.149576018426, "kl": 83.02157592773438, "learning_rate": 1.4732145315086843e-07, "loss": 3.4164, "num_tokens": 2222788406.0, "reward": 2.3794643878936768, "reward_std": 0.44413241744041443, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 952.7410888671875, "completions/mean_terminated_length": 753.3403930664062, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.8679345799371371, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1430022607596674, "kl": 0.02838134765625, "learning_rate": 1.4717213022095988e-07, "loss": 0.0693, "num_tokens": 2223279490.0, "reward": 2.5669643878936768, "reward_std": 0.3638642430305481, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09585529565811157, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 929.747802734375, "completions/mean_terminated_length": 789.2637939453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.868147674604443, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12926350850774657, "kl": 0.029693603515625, "learning_rate": 1.470230302254434e-07, "loss": 0.0343, "num_tokens": 2223770865.0, "reward": 2.505580425262451, "reward_std": 0.3987835645675659, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11709482222795486, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1012.4241333007812, "completions/mean_terminated_length": 814.122314453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.868360769271749, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11816358792307348, "kl": 0.028228759765625, "learning_rate": 1.4687415324683445e-07, "loss": 0.045, "num_tokens": 2224292863.0, "reward": 2.4112725257873535, "reward_std": 0.4191451370716095, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13802284002304077, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 882.62060546875, "completions/mean_terminated_length": 709.3077392578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8685738639390549, "frac_reward_zero_std": 0.0, "grad_norm": 0.14272499573444483, "kl": 0.029541015625, "learning_rate": 1.4672549936752505e-07, "loss": 0.1, "num_tokens": 2224753557.0, "reward": 2.451451063156128, "reward_std": 0.46408915519714355, "rewards/accuracy_reward/mean": 0.5694444179534912, "rewards/accuracy_reward/std": 0.495728075504303, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1434776484966278, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1072.8973388671875, "completions/mean_terminated_length": 803.4244995117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8687869586063609, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13346013139901514, "kl": 0.02606201171875, "learning_rate": 1.4657706866978359e-07, "loss": 0.1003, "num_tokens": 2225307287.0, "reward": 2.3058037757873535, "reward_std": 0.48475760221481323, "rewards/accuracy_reward/mean": 0.44212964177131653, "rewards/accuracy_reward/std": 0.4972155690193176, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 959.0826416015625, "completions/mean_terminated_length": 740.1314086914062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8690000532736668, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13354302072630625, "kl": 0.02789306640625, "learning_rate": 1.464288612357553e-07, "loss": 0.0657, "num_tokens": 2225803788.0, "reward": 2.5089287757873535, "reward_std": 0.3907046616077423, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11244472116231918, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1036.1004638671875, "completions/mean_terminated_length": 816.122314453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8692131479409728, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13371107186113698, "kl": 0.02520751953125, "learning_rate": 1.462808771474617e-07, "loss": 0.0432, "num_tokens": 2226339817.0, "reward": 2.4090402126312256, "reward_std": 0.41504529118537903, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1182868480682373, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 969.2567138671875, "completions/mean_terminated_length": 723.9534301757812, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.8694262426082787, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13785561599797685, "kl": 0.02880859375, "learning_rate": 1.4613311648680032e-07, "loss": 0.0602, "num_tokens": 2226840748.0, "reward": 2.4955358505249023, "reward_std": 0.44741523265838623, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.4908071458339691, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15274205803871155, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1023.2120971679688, "completions/mean_terminated_length": 800.4320678710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8696393372755847, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11843753910217387, "kl": 0.027740478515625, "learning_rate": 1.4598557933554573e-07, "loss": 0.0715, "num_tokens": 2227365995.0, "reward": 2.453125, "reward_std": 0.4121227562427521, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1017.27685546875, "completions/mean_terminated_length": 829.6253662109375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.8698524319428906, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12539022499083557, "kl": 0.027862548828125, "learning_rate": 1.4583826577534823e-07, "loss": 0.0661, "num_tokens": 2227891543.0, "reward": 2.4185268878936768, "reward_std": 0.41020217537879944, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.10627461224794388, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 947.2277221679688, "completions/mean_terminated_length": 824.3126220703125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8700655266101965, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13509524652411065, "kl": 0.02880859375, "learning_rate": 1.456911758877348e-07, "loss": 0.0863, "num_tokens": 2228385101.0, "reward": 2.4168527126312256, "reward_std": 0.41891607642173767, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1086.341552734375, "completions/mean_terminated_length": 851.2694702148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8702786212775026, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12745857006299832, "kl": 0.026641845703125, "learning_rate": 1.4554430975410822e-07, "loss": 0.0828, "num_tokens": 2228942534.0, "reward": 2.381138563156128, "reward_std": 0.4991455674171448, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.161164328455925, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1087.680908203125, "completions/mean_terminated_length": 849.6072387695312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8704917159448085, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11049052548951625, "kl": 0.024383544921875, "learning_rate": 1.4539766745574772e-07, "loss": 0.0472, "num_tokens": 2229505063.0, "reward": 2.302455425262451, "reward_std": 0.42382556200027466, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11709482222795486, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 944.341552734375, "completions/mean_terminated_length": 770.3798828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8707048106121145, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1283670473306525, "kl": 0.03057861328125, "learning_rate": 1.4525124907380866e-07, "loss": 0.0917, "num_tokens": 2229997872.0, "reward": 2.439174175262451, "reward_std": 0.42247307300567627, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15578390657901764, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1008.1317138671875, "completions/mean_terminated_length": 871.5833129882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8709179052794204, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10213503831026428, "kl": 0.026458740234375, "learning_rate": 1.451050546893224e-07, "loss": 0.0416, "num_tokens": 2230522539.0, "reward": 2.4324777126312256, "reward_std": 0.36738675832748413, "rewards/accuracy_reward/mean": 0.5185185074806213, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10210946202278137, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1063.7857666015625, "completions/mean_terminated_length": 773.6416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8711309999467264, "frac_reward_zero_std": 0.0, "grad_norm": 0.12922279668882436, "kl": 0.0257568359375, "learning_rate": 1.4495908438319626e-07, "loss": 0.0711, "num_tokens": 2231074075.0, "reward": 2.3364956378936768, "reward_std": 0.4675809442996979, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13572438061237335, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1024.953125, "completions/mean_terminated_length": 771.3286743164062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8713440946140323, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11734569491377006, "kl": 0.024688720703125, "learning_rate": 1.448133382362136e-07, "loss": 0.0598, "num_tokens": 2231610262.0, "reward": 2.3956475257873535, "reward_std": 0.435901403427124, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1439727544784546, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 873.9844360351562, "completions/mean_terminated_length": 713.0786743164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8715571892813382, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.18346112444198404, "kl": 0.0357666015625, "learning_rate": 1.4466781632903403e-07, "loss": 0.0644, "num_tokens": 2232072623.0, "reward": 2.4799108505249023, "reward_std": 0.49145856499671936, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 884.6875610351562, "completions/mean_terminated_length": 731.9293212890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8717702839486442, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1313626534332317, "kl": 0.028594970703125, "learning_rate": 1.4452251874219245e-07, "loss": 0.1078, "num_tokens": 2232542643.0, "reward": 2.501674175262451, "reward_std": 0.36803072690963745, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09687710553407669, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 817.5156860351562, "completions/mean_terminated_length": 662.93212890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8719833786159501, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14064417787711353, "kl": 0.031463623046875, "learning_rate": 1.4437744555610008e-07, "loss": 0.1035, "num_tokens": 2232978778.0, "reward": 2.5279018878936768, "reward_std": 0.3254278302192688, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09750169515609741, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 903.2500610351562, "completions/mean_terminated_length": 746.3552856445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8721964732832561, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12331589925576429, "kl": 0.03125, "learning_rate": 1.4423259685104384e-07, "loss": 0.0639, "num_tokens": 2233450026.0, "reward": 2.529017925262451, "reward_std": 0.3627117872238159, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13686132431030273, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 908.5045166015625, "completions/mean_terminated_length": 774.9476318359375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.872409567950562, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13902813412145534, "kl": 0.030731201171875, "learning_rate": 1.4408797270718645e-07, "loss": 0.1151, "num_tokens": 2233921852.0, "reward": 2.532924175262451, "reward_std": 0.45692914724349976, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12428762763738632, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 943.46435546875, "completions/mean_terminated_length": 735.4482421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.872622662617868, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1432911726822847, "kl": 0.032684326171875, "learning_rate": 1.4394357320456623e-07, "loss": 0.0652, "num_tokens": 2234420684.0, "reward": 2.4034600257873535, "reward_std": 0.37642720341682434, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519906997680664, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 887.747802734375, "completions/mean_terminated_length": 676.5145263671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8728357572851739, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.15051861371665326, "kl": 0.030609130859375, "learning_rate": 1.437993984230973e-07, "loss": 0.1104, "num_tokens": 2234885371.0, "reward": 2.513392925262451, "reward_std": 0.4087888300418854, "rewards/accuracy_reward/mean": 0.6273148059844971, "rewards/accuracy_reward/std": 0.48407992720603943, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13731664419174194, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 848.2254638671875, "completions/mean_terminated_length": 680.3180541992188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8730488519524799, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1532225688412467, "kl": 0.03289794921875, "learning_rate": 1.4365544844256922e-07, "loss": 0.0984, "num_tokens": 2235332240.0, "reward": 2.5033483505249023, "reward_std": 0.388235867023468, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492350101471, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15374813973903656, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 910.5870971679688, "completions/mean_terminated_length": 714.0706787109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8732619466197858, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.27450434845069077, "kl": 0.036773681640625, "learning_rate": 1.4351172334264756e-07, "loss": 0.0773, "num_tokens": 2235817511.0, "reward": 2.5267858505249023, "reward_std": 0.46538981795310974, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.13318143784999847, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 875.1674194335938, "completions/mean_terminated_length": 740.962646484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8734750412870917, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13123571764869235, "kl": 0.03033447265625, "learning_rate": 1.4336822320287284e-07, "loss": 0.0093, "num_tokens": 2236276386.0, "reward": 2.4252233505249023, "reward_std": 0.36274856328964233, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10841450095176697, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 930.5089721679688, "completions/mean_terminated_length": 764.3179931640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8736881359543978, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13298803285086572, "kl": 0.028778076171875, "learning_rate": 1.4322494810266167e-07, "loss": 0.0796, "num_tokens": 2236761606.0, "reward": 2.4598214626312256, "reward_std": 0.3826166093349457, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11444751173257828, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1037.40185546875, "completions/mean_terminated_length": 804.1868286132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8739012306217037, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13875004870749313, "kl": 0.026153564453125, "learning_rate": 1.4308189812130572e-07, "loss": 0.0843, "num_tokens": 2237287418.0, "reward": 2.2572546005249023, "reward_std": 0.4233165681362152, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.18169322609901428, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 885.4241333007812, "completions/mean_terminated_length": 719.341796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8741143252890097, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.5584138699317963, "kl": 0.04290771484375, "learning_rate": 1.429390733379723e-07, "loss": 0.0939, "num_tokens": 2237756776.0, "reward": 2.4598214626312256, "reward_std": 0.4481230080127716, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.49405214190483093, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14285963773727417, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1153.30810546875, "completions/mean_terminated_length": 844.330322265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8743274199563156, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10554906003292863, "kl": 0.02264404296875, "learning_rate": 1.4279647383170387e-07, "loss": 0.0648, "num_tokens": 2238340706.0, "reward": 2.3738839626312256, "reward_std": 0.37633055448532104, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13814601302146912, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 903.9219360351562, "completions/mean_terminated_length": 716.7091064453125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8745405146236216, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.18949633495950627, "kl": 0.032623291015625, "learning_rate": 1.4265409968141838e-07, "loss": 0.0799, "num_tokens": 2238823615.0, "reward": 2.478794813156128, "reward_std": 0.3823574185371399, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14160890877246857, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1122.53125, "completions/mean_terminated_length": 870.1307373046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8747536092909275, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1233511403486463, "kl": 0.021728515625, "learning_rate": 1.4251195096590905e-07, "loss": 0.0697, "num_tokens": 2239400349.0, "reward": 2.3470983505249023, "reward_std": 0.38672545552253723, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.12332592159509659, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 945.0670166015625, "completions/mean_terminated_length": 771.2196655273438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8749667039582334, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11512383257623415, "kl": 0.028289794921875, "learning_rate": 1.4237002776384437e-07, "loss": 0.0677, "num_tokens": 2239891019.0, "reward": 2.459263563156128, "reward_std": 0.3542226552963257, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1041.21875, "completions/mean_terminated_length": 828.9783935546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8751797986255394, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12169230665160817, "kl": 0.026123046875, "learning_rate": 1.422283301537679e-07, "loss": 0.0619, "num_tokens": 2240424621.0, "reward": 2.4425225257873535, "reward_std": 0.4512191116809845, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.143477663397789, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1024.7991943359375, "completions/mean_terminated_length": 815.758056640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8753928932928453, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11807054702197434, "kl": 0.02667236328125, "learning_rate": 1.4208685821409839e-07, "loss": 0.0597, "num_tokens": 2240949683.0, "reward": 2.4425225257873535, "reward_std": 0.40510037541389465, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1092.640625, "completions/mean_terminated_length": 900.5442504882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8756059879601513, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12082401118986372, "kl": 0.023101806640625, "learning_rate": 1.4194561202312978e-07, "loss": 0.0987, "num_tokens": 2241508466.0, "reward": 2.361607313156128, "reward_std": 0.4608380198478699, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13058780133724213, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1022.2254638671875, "completions/mean_terminated_length": 829.0424194335938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8758190826274572, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11690834934567863, "kl": 0.02947998046875, "learning_rate": 1.4180459165903106e-07, "loss": 0.0553, "num_tokens": 2242034503.0, "reward": 2.45703125, "reward_std": 0.40497055649757385, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12992528080940247, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1059.5045166015625, "completions/mean_terminated_length": 844.6141357421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8760321772947632, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1257085408036062, "kl": 0.025848388671875, "learning_rate": 1.4166379719984606e-07, "loss": 0.1079, "num_tokens": 2242580041.0, "reward": 2.5111608505249023, "reward_std": 0.4596747159957886, "rewards/accuracy_reward/mean": 0.6342592835426331, "rewards/accuracy_reward/std": 0.4821956753730774, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13376134634017944, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1100.2076416015625, "completions/mean_terminated_length": 861.9357299804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8762452719620691, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11219817615665337, "kl": 0.023834228515625, "learning_rate": 1.415232287234939e-07, "loss": 0.0721, "num_tokens": 2243145110.0, "reward": 2.256138563156128, "reward_std": 0.35672348737716675, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 882.1250610351562, "completions/mean_terminated_length": 718.9618530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8764583666293752, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1233745299271287, "kl": 0.028717041015625, "learning_rate": 1.413828863077684e-07, "loss": 0.0555, "num_tokens": 2243599838.0, "reward": 2.6199777126312256, "reward_std": 0.3767786920070648, "rewards/accuracy_reward/mean": 0.6986607313156128, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1022.96435546875, "completions/mean_terminated_length": 826.6808471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.876671461296681, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12316234667394567, "kl": 0.026092529296875, "learning_rate": 1.4124277003033842e-07, "loss": 0.0538, "num_tokens": 2244126798.0, "reward": 2.325892925262451, "reward_std": 0.41007548570632935, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12717820703983307, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 897.7277221679688, "completions/mean_terminated_length": 743.3873291015625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.876884555963987, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12083076920625277, "kl": 0.028778076171875, "learning_rate": 1.4110287996874745e-07, "loss": 0.0536, "num_tokens": 2244588804.0, "reward": 2.5279018878936768, "reward_std": 0.37047278881073, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12975822389125824, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1002.997802734375, "completions/mean_terminated_length": 732.9410400390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.877097650631293, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.125971283419655, "kl": 0.0242919921875, "learning_rate": 1.4096321620041396e-07, "loss": 0.0312, "num_tokens": 2245110403.0, "reward": 2.33203125, "reward_std": 0.40842893719673157, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.13192765414714813, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 994.8348388671875, "completions/mean_terminated_length": 776.2533569335938, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8773107452985989, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1512076860260538, "kl": 0.029266357421875, "learning_rate": 1.4082377880263138e-07, "loss": 0.1202, "num_tokens": 2245626425.0, "reward": 2.3744421005249023, "reward_std": 0.47978514432907104, "rewards/accuracy_reward/mean": 0.5069444179534912, "rewards/accuracy_reward/std": 0.5005314350128174, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15122966468334198, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 914.7120971679688, "completions/mean_terminated_length": 722.3786010742188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8775238399659049, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13758621002695806, "kl": 0.031005859375, "learning_rate": 1.4068456785256726e-07, "loss": 0.0654, "num_tokens": 2246106008.0, "reward": 2.5340402126312256, "reward_std": 0.36230164766311646, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1119314506649971, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1099.546875, "completions/mean_terminated_length": 790.8787231445312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8777369346332108, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10518328248094859, "kl": 0.0234375, "learning_rate": 1.4054558342726453e-07, "loss": 0.0199, "num_tokens": 2246676573.0, "reward": 2.408482313156128, "reward_std": 0.3364168405532837, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.10325382649898529, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 971.8125610351562, "completions/mean_terminated_length": 751.9462280273438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8779500293005168, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14306732496703112, "kl": 0.02606201171875, "learning_rate": 1.404068256036403e-07, "loss": 0.1091, "num_tokens": 2247188057.0, "reward": 2.4698662757873535, "reward_std": 0.432937890291214, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13911856710910797, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 919.4777221679688, "completions/mean_terminated_length": 734.8103637695312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8781631239678227, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14060157722041228, "kl": 0.030242919921875, "learning_rate": 1.4026829445848642e-07, "loss": 0.0589, "num_tokens": 2247667215.0, "reward": 2.490513563156128, "reward_std": 0.3907771706581116, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13041439652442932, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1021.732177734375, "completions/mean_terminated_length": 730.6132202148438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8783762186351286, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1265082713534206, "kl": 0.024749755859375, "learning_rate": 1.4012999006846926e-07, "loss": 0.0808, "num_tokens": 2248196551.0, "reward": 2.5200893878936768, "reward_std": 0.3904174864292145, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 954.575927734375, "completions/mean_terminated_length": 814.1107788085938, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8785893133024346, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1455699337163834, "kl": 0.028228759765625, "learning_rate": 1.3999191251012964e-07, "loss": 0.1229, "num_tokens": 2248694585.0, "reward": 2.5161831378936768, "reward_std": 0.46627891063690186, "rewards/accuracy_reward/mean": 0.6342592835426331, "rewards/accuracy_reward/std": 0.482195645570755, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1354389488697052, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 917.1317138671875, "completions/mean_terminated_length": 790.8560791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8788024079697405, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11273490131696988, "kl": 0.02911376953125, "learning_rate": 1.3985406185988314e-07, "loss": 0.0853, "num_tokens": 2249166788.0, "reward": 2.6729912757873535, "reward_std": 0.42071303725242615, "rewards/accuracy_reward/mean": 0.7611607313156128, "rewards/accuracy_reward/std": 0.4268510043621063, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11589459329843521, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 951.4442138671875, "completions/mean_terminated_length": 758.611572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8790155026370465, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13527557948745145, "kl": 0.0299072265625, "learning_rate": 1.397164381940193e-07, "loss": 0.0958, "num_tokens": 2249662891.0, "reward": 2.5189733505249023, "reward_std": 0.41511550545692444, "rewards/accuracy_reward/mean": 0.6597222089767456, "rewards/accuracy_reward/std": 0.47435182332992554, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14160890877246857, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1010.8795166015625, "completions/mean_terminated_length": 795.6279907226562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8792285973043524, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13513263123848054, "kl": 0.02630615234375, "learning_rate": 1.3957904158870248e-07, "loss": 0.1093, "num_tokens": 2250184597.0, "reward": 2.4681921005249023, "reward_std": 0.43210306763648987, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1552460640668869, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 987.2076416015625, "completions/mean_terminated_length": 794.0818481445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8794416919716584, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12276265016604013, "kl": 0.027313232421875, "learning_rate": 1.3944187211997104e-07, "loss": 0.0701, "num_tokens": 2250697714.0, "reward": 2.506138563156128, "reward_std": 0.4436274468898773, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 957.30810546875, "completions/mean_terminated_length": 744.9866333007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8796547866389643, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1754091684887157, "kl": 0.031951904296875, "learning_rate": 1.3930492986373784e-07, "loss": 0.0624, "num_tokens": 2251198828.0, "reward": 2.42578125, "reward_std": 0.4366443455219269, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 962.8125610351562, "completions/mean_terminated_length": 795.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8798678813062704, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1283879790766306, "kl": 0.028564453125, "learning_rate": 1.3916821489578996e-07, "loss": 0.0447, "num_tokens": 2251693992.0, "reward": 2.5050225257873535, "reward_std": 0.4213351905345917, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517838180065155, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1091.8951416015625, "completions/mean_terminated_length": 841.4224853515625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8800809759735763, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1120346139421575, "kl": 0.024078369140625, "learning_rate": 1.3903172729178854e-07, "loss": 0.0512, "num_tokens": 2252251657.0, "reward": 2.24609375, "reward_std": 0.3908710777759552, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.143477663397789, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1133.0023193359375, "completions/mean_terminated_length": 824.3612060546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8802940706408822, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11183882074357733, "kl": 0.022552490234375, "learning_rate": 1.3889546712726924e-07, "loss": 0.0664, "num_tokens": 2252830186.0, "reward": 2.3348214626312256, "reward_std": 0.31416070461273193, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.09060624986886978, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 918.3504638671875, "completions/mean_terminated_length": 753.6700439453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8805071653081882, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.39833456886609653, "kl": 0.034423828125, "learning_rate": 1.3875943447764155e-07, "loss": 0.1072, "num_tokens": 2253318279.0, "reward": 2.4285714626312256, "reward_std": 0.48930060863494873, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.15527118742465973, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 960.4219360351562, "completions/mean_terminated_length": 801.8746948242188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8807202599754941, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14112593125284428, "kl": 0.0313720703125, "learning_rate": 1.3862362941818894e-07, "loss": 0.073, "num_tokens": 2253812020.0, "reward": 2.5580358505249023, "reward_std": 0.3961932361125946, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.4776431620121002, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11286582797765732, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 997.372802734375, "completions/mean_terminated_length": 779.3180541992188, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8809333546428001, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13669803972692854, "kl": 0.027801513671875, "learning_rate": 1.384880520240694e-07, "loss": 0.099, "num_tokens": 2254325163.0, "reward": 2.462611675262451, "reward_std": 0.4169662594795227, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1199323832988739, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1008.4754638671875, "completions/mean_terminated_length": 779.0435791015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.881146449310106, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13605609765479867, "kl": 0.02728271484375, "learning_rate": 1.3835270237031439e-07, "loss": 0.0899, "num_tokens": 2254847904.0, "reward": 2.357142925262451, "reward_std": 0.43272456526756287, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15365473926067352, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 903.2388916015625, "completions/mean_terminated_length": 729.61181640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.881359543977412, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12512603042140577, "kl": 0.033477783203125, "learning_rate": 1.382175805318299e-07, "loss": 0.0257, "num_tokens": 2255313739.0, "reward": 2.5284600257873535, "reward_std": 0.32592588663101196, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.11684267967939377, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1053.341552734375, "completions/mean_terminated_length": 887.5651245117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8815726386447179, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11587492375591205, "kl": 0.027923583984375, "learning_rate": 1.3808268658339506e-07, "loss": -0.0068, "num_tokens": 2255856116.0, "reward": 2.3989956378936768, "reward_std": 0.3875787556171417, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1019.638427734375, "completions/mean_terminated_length": 822.7180786132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8817857333120238, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1492363092681286, "kl": 0.026397705078125, "learning_rate": 1.3794802059966378e-07, "loss": 0.0793, "num_tokens": 2256385362.0, "reward": 2.3839287757873535, "reward_std": 0.44679322838783264, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1378791779279709, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 942.1897583007812, "completions/mean_terminated_length": 761.2389526367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8819988279793298, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12533443158549135, "kl": 0.029632568359375, "learning_rate": 1.3781358265516324e-07, "loss": 0.0596, "num_tokens": 2256871495.0, "reward": 2.4693081378936768, "reward_std": 0.4305974543094635, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1328611671924591, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1149.6585693359375, "completions/mean_terminated_length": 864.302978515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8822119226466357, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12655382202218207, "kl": 0.024810791015625, "learning_rate": 1.3767937282429448e-07, "loss": 0.0883, "num_tokens": 2257460926.0, "reward": 2.3113839626312256, "reward_std": 0.5290858745574951, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1599874496459961, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 903.6428833007812, "completions/mean_terminated_length": 736.8184204101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8824250173139417, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13653692157954006, "kl": 0.028839111328125, "learning_rate": 1.375453911813324e-07, "loss": 0.0821, "num_tokens": 2257932878.0, "reward": 2.4453125, "reward_std": 0.44600623846054077, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13507550954818726, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 965.88623046875, "completions/mean_terminated_length": 792.0751342773438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8826381119812476, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12745522914075622, "kl": 0.028594970703125, "learning_rate": 1.374116378004255e-07, "loss": 0.0366, "num_tokens": 2258433611.0, "reward": 2.4799108505249023, "reward_std": 0.43910205364227295, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.08826113492250443, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 890.0357666015625, "completions/mean_terminated_length": 721.2276000976562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8828512066485537, "frac_reward_zero_std": 0.25, "grad_norm": 0.12926332774022947, "kl": 0.02838134765625, "learning_rate": 1.372781127555963e-07, "loss": 0.0392, "num_tokens": 2258899371.0, "reward": 2.5234375, "reward_std": 0.35832515358924866, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13865114748477936, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 980.435302734375, "completions/mean_terminated_length": 789.3973999023438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8830643013158596, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14136847643221295, "kl": 0.028167724609375, "learning_rate": 1.3714481612074047e-07, "loss": 0.0552, "num_tokens": 2259412910.0, "reward": 2.29296875, "reward_std": 0.3875815272331238, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12356231361627579, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1045.4754638671875, "completions/mean_terminated_length": 840.6586303710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8832773959831656, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1276031823902063, "kl": 0.026824951171875, "learning_rate": 1.3701174796962743e-07, "loss": 0.0862, "num_tokens": 2259951667.0, "reward": 2.404576063156128, "reward_std": 0.4412045478820801, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813789844513, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1044.4285888671875, "completions/mean_terminated_length": 802.5706176757812, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8834904906504715, "frac_reward_zero_std": 0.25, "grad_norm": 0.10287085495815296, "kl": 0.024322509765625, "learning_rate": 1.3687890837590044e-07, "loss": 0.0373, "num_tokens": 2260490467.0, "reward": 2.3504464626312256, "reward_std": 0.3876363933086395, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14336557686328888, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1032.575927734375, "completions/mean_terminated_length": 770.1629028320312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8837035853177774, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15021665747205368, "kl": 0.02618408203125, "learning_rate": 1.3674629741307594e-07, "loss": 0.1457, "num_tokens": 2261015557.0, "reward": 2.3950893878936768, "reward_std": 0.509012758731842, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9486607313156128, "rewards/tag_count_reward/std": 0.1797599494457245, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 866.9397583007812, "completions/mean_terminated_length": 708.4683837890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8839166799850834, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13465845816710117, "kl": 0.03302001953125, "learning_rate": 1.3661391515454396e-07, "loss": 0.0526, "num_tokens": 2261477930.0, "reward": 2.3431921005249023, "reward_std": 0.3759476840496063, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1336481273174286, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1144.904052734375, "completions/mean_terminated_length": 914.703125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8841297746523893, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12465350130852852, "kl": 0.0252685546875, "learning_rate": 1.3648176167356782e-07, "loss": 0.0648, "num_tokens": 2262062383.0, "reward": 2.2918527126312256, "reward_std": 0.45645463466644287, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14843006432056427, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1036.2835693359375, "completions/mean_terminated_length": 788.9750366210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8843428693196953, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1319010865255141, "kl": 0.02880859375, "learning_rate": 1.3634983704328457e-07, "loss": 0.0977, "num_tokens": 2262595486.0, "reward": 2.4486608505249023, "reward_std": 0.4144447445869446, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1605832874774933, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1058.33935546875, "completions/mean_terminated_length": 799.0760498046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8845559639870012, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12893099671427674, "kl": 0.0269775390625, "learning_rate": 1.3621814133670435e-07, "loss": 0.0786, "num_tokens": 2263141558.0, "reward": 2.3833706378936768, "reward_std": 0.5027891397476196, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3096405565738678, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.1675894856452942, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 888.8058471679688, "completions/mean_terminated_length": 752.940185546875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8847690586543072, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13333443124834937, "kl": 0.0296630859375, "learning_rate": 1.3608667462671044e-07, "loss": 0.0247, "num_tokens": 2263608431.0, "reward": 2.5513393878936768, "reward_std": 0.39516928791999817, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10649171471595764, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1060.055908203125, "completions/mean_terminated_length": 832.0687255859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.8849821533216131, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12767413539345768, "kl": 0.025848388671875, "learning_rate": 1.3595543698605988e-07, "loss": 0.0738, "num_tokens": 2264155592.0, "reward": 2.3331475257873535, "reward_std": 0.5176364779472351, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1064.375, "completions/mean_terminated_length": 781.72412109375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8851952479889191, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12215680475396902, "kl": 0.024200439453125, "learning_rate": 1.3582442848738252e-07, "loss": 0.1043, "num_tokens": 2264699040.0, "reward": 2.4034600257873535, "reward_std": 0.4074050486087799, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11540208011865616, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 994.8482666015625, "completions/mean_terminated_length": 762.40869140625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.885408342656225, "frac_reward_zero_std": 0.25, "grad_norm": 0.13498585074124453, "kl": 0.028656005859375, "learning_rate": 1.3569364920318155e-07, "loss": 0.1081, "num_tokens": 2265209804.0, "reward": 2.3046875, "reward_std": 0.3655664026737213, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1464626044034958, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 921.5692138671875, "completions/mean_terminated_length": 760.6505126953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.8856214373235309, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13126511488096904, "kl": 0.030853271484375, "learning_rate": 1.3556309920583332e-07, "loss": 0.0309, "num_tokens": 2265688603.0, "reward": 2.53515625, "reward_std": 0.331646203994751, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.13925647735595703, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1056.62060546875, "completions/mean_terminated_length": 810.8468017578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.885834531990837, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13179544745997673, "kl": 0.02508544921875, "learning_rate": 1.3543277856758712e-07, "loss": 0.0838, "num_tokens": 2266231265.0, "reward": 2.4246652126312256, "reward_std": 0.3900611996650696, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13064393401145935, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 965.69873046875, "completions/mean_terminated_length": 788.5947875976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8860476266581429, "frac_reward_zero_std": 0.25, "grad_norm": 0.11949250115432297, "kl": 0.028472900390625, "learning_rate": 1.3530268736056565e-07, "loss": 0.1004, "num_tokens": 2266733130.0, "reward": 2.489955425262451, "reward_std": 0.33382484316825867, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9854910969734192, "rewards/tag_count_reward/std": 0.09785954654216766, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1132.7857666015625, "completions/mean_terminated_length": 869.7930908203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8862607213254489, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11266642714720972, "kl": 0.02276611328125, "learning_rate": 1.351728256567644e-07, "loss": 0.0916, "num_tokens": 2267313290.0, "reward": 2.3125, "reward_std": 0.40958693623542786, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.13325640559196472, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1103.375, "completions/mean_terminated_length": 795.9526977539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8864738159927548, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11452594457792649, "kl": 0.026031494140625, "learning_rate": 1.3504319352805179e-07, "loss": 0.0482, "num_tokens": 2267875778.0, "reward": 2.3560268878936768, "reward_std": 0.3648747205734253, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1502326875925064, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1005.013427734375, "completions/mean_terminated_length": 785.1405639648438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8866869106600608, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1314363778744809, "kl": 0.03143310546875, "learning_rate": 1.3491379104616938e-07, "loss": 0.082, "num_tokens": 2268390472.0, "reward": 2.4012277126312256, "reward_std": 0.42521148920059204, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 949.99560546875, "completions/mean_terminated_length": 732.7433471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8869000053273667, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14584501560191895, "kl": 0.033966064453125, "learning_rate": 1.347846182827314e-07, "loss": 0.0976, "num_tokens": 2268881462.0, "reward": 2.5546875, "reward_std": 0.4590505361557007, "rewards/accuracy_reward/mean": 0.6919642686843872, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15063102543354034, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 959.0000610351562, "completions/mean_terminated_length": 743.5294189453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8871130999946726, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12008285243699954, "kl": 0.0279541015625, "learning_rate": 1.3465567530922526e-07, "loss": 0.047, "num_tokens": 2269379190.0, "reward": 2.3800225257873535, "reward_std": 0.39347752928733826, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1363026201725006, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 929.6585083007812, "completions/mean_terminated_length": 763.341064453125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8873261946619786, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13550099736132898, "kl": 0.0335693359375, "learning_rate": 1.345269621970108e-07, "loss": 0.0773, "num_tokens": 2269862813.0, "reward": 2.4213171005249023, "reward_std": 0.4631577134132385, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1277548223733902, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 999.1183471679688, "completions/mean_terminated_length": 833.7907104492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8875392893292845, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11426661027477648, "kl": 0.027313232421875, "learning_rate": 1.3439847901732116e-07, "loss": 0.0267, "num_tokens": 2270381458.0, "reward": 2.4737725257873535, "reward_std": 0.42879223823547363, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13064393401145935, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 988.7277221679688, "completions/mean_terminated_length": 824.9226684570312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8877523839965905, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1423658838220071, "kl": 0.031768798828125, "learning_rate": 1.3427022584126173e-07, "loss": 0.0705, "num_tokens": 2270894264.0, "reward": 2.419642925262451, "reward_std": 0.4749682545661926, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13989263772964478, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 813.6517944335938, "completions/mean_terminated_length": 672.407958984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8879654786638964, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1555839215612168, "kl": 0.03387451171875, "learning_rate": 1.341422027398109e-07, "loss": 0.0946, "num_tokens": 2271325644.0, "reward": 2.5245537757873535, "reward_std": 0.40580102801322937, "rewards/accuracy_reward/mean": 0.6272321343421936, "rewards/accuracy_reward/std": 0.4840816557407379, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13376134634017944, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1005.6406860351562, "completions/mean_terminated_length": 799.3984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8881785733312024, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12830997387760776, "kl": 0.027069091796875, "learning_rate": 1.340144097838197e-07, "loss": 0.071, "num_tokens": 2271850923.0, "reward": 2.4598214626312256, "reward_std": 0.46587109565734863, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.4929138123989105, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12223286926746368, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 990.94873046875, "completions/mean_terminated_length": 801.7921142578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8883916679985083, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1413313435076299, "kl": 0.026824951171875, "learning_rate": 1.3388684704401155e-07, "loss": 0.0847, "num_tokens": 2272356068.0, "reward": 2.3470983505249023, "reward_std": 0.4011780321598053, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709373772144318, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 912.43310546875, "completions/mean_terminated_length": 736.8298950195312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8886047626658143, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13113559089731008, "kl": 0.029266357421875, "learning_rate": 1.33759514590983e-07, "loss": 0.0393, "num_tokens": 2272831542.0, "reward": 2.502232313156128, "reward_std": 0.4183984100818634, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12733517587184906, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 984.8170166015625, "completions/mean_terminated_length": 764.1563110351562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8888178573331202, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13617647948160563, "kl": 0.031707763671875, "learning_rate": 1.3363241249520251e-07, "loss": 0.0501, "num_tokens": 2273352500.0, "reward": 2.385044813156128, "reward_std": 0.4162093698978424, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1468711644411087, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 936.8326416015625, "completions/mean_terminated_length": 720.5253295898438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8890309520004261, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1398165106656057, "kl": 0.029998779296875, "learning_rate": 1.3350554082701155e-07, "loss": 0.1178, "num_tokens": 2273838745.0, "reward": 2.396763563156128, "reward_std": 0.420979768037796, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14078108966350555, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 919.7835083007812, "completions/mean_terminated_length": 738.5673217773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8892440466677322, "frac_reward_zero_std": 0.25, "grad_norm": 0.11619482855785468, "kl": 0.03070068359375, "learning_rate": 1.333788996566238e-07, "loss": 0.0285, "num_tokens": 2274314504.0, "reward": 2.4760046005249023, "reward_std": 0.414402574300766, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 900.8973388671875, "completions/mean_terminated_length": 709.7135620117188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8894571413350381, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15130816759108975, "kl": 0.030914306640625, "learning_rate": 1.3325248905412544e-07, "loss": 0.1496, "num_tokens": 2274788858.0, "reward": 2.4224331378936768, "reward_std": 0.4770428240299225, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9514508843421936, "rewards/tag_count_reward/std": 0.16217589378356934, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1084.185302734375, "completions/mean_terminated_length": 817.8319091796875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8896702360023441, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12034071773916155, "kl": 0.0262451171875, "learning_rate": 1.3312630908947514e-07, "loss": 0.0371, "num_tokens": 2275342941.0, "reward": 2.411830425262451, "reward_std": 0.44236335158348083, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16740036010742188, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 954.4063110351562, "completions/mean_terminated_length": 775.4545288085938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.88988333066965, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14857972524808974, "kl": 0.028778076171875, "learning_rate": 1.330003598325037e-07, "loss": 0.1171, "num_tokens": 2275836819.0, "reward": 2.5072546005249023, "reward_std": 0.40550389885902405, "rewards/accuracy_reward/mean": 0.6087962985038757, "rewards/accuracy_reward/std": 0.4885856807231903, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12292414903640747, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1038.9442138671875, "completions/mean_terminated_length": 845.720703125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.890096425336956, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13286117642704873, "kl": 0.027435302734375, "learning_rate": 1.3287464135291456e-07, "loss": 0.0837, "num_tokens": 2276380602.0, "reward": 2.458705425262451, "reward_std": 0.47483500838279724, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15247619152069092, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1104.9910888671875, "completions/mean_terminated_length": 861.2921142578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8903095200042619, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12262046174993887, "kl": 0.0257568359375, "learning_rate": 1.3274915372028317e-07, "loss": 0.0722, "num_tokens": 2276950390.0, "reward": 2.478236675262451, "reward_std": 0.47717219591140747, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14732414484024048, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1044.727783203125, "completions/mean_terminated_length": 842.9973754882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8905226146715678, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 1.0667453980886112, "kl": 0.122467041015625, "learning_rate": 1.3262389700405743e-07, "loss": 0.0924, "num_tokens": 2277492076.0, "reward": 2.431919813156128, "reward_std": 0.4180252254009247, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13661938905715942, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1017.013427734375, "completions/mean_terminated_length": 832.5211181640625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8907357093388738, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1278579701484868, "kl": 0.02581787109375, "learning_rate": 1.3249887127355721e-07, "loss": 0.093, "num_tokens": 2278020962.0, "reward": 2.4073662757873535, "reward_std": 0.41762349009513855, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09892533719539642, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1108.69873046875, "completions/mean_terminated_length": 855.9121704101562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8909488040061797, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.18512011249742813, "kl": 0.030548095703125, "learning_rate": 1.3237407659797485e-07, "loss": 0.0576, "num_tokens": 2278586555.0, "reward": 2.3833706378936768, "reward_std": 0.45720839500427246, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15488377213478088, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1070.859375, "completions/mean_terminated_length": 838.7210083007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8911618986734857, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1232918356046186, "kl": 0.025421142578125, "learning_rate": 1.3224951304637446e-07, "loss": 0.0733, "num_tokens": 2279142396.0, "reward": 2.5418527126312256, "reward_std": 0.4559517800807953, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.482267826795578, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13469025492668152, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1097.2054443359375, "completions/mean_terminated_length": 834.4501342773438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8913749933407916, "frac_reward_zero_std": 0.0, "grad_norm": 0.12874870656372073, "kl": 0.027496337890625, "learning_rate": 1.321251806876925e-07, "loss": 0.0459, "num_tokens": 2279704552.0, "reward": 2.4129464626312256, "reward_std": 0.5138823986053467, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.15647254884243011, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 927.0870971679688, "completions/mean_terminated_length": 715.9867553710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8915880880080976, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1365260904839435, "kl": 0.027984619140625, "learning_rate": 1.3200107959073752e-07, "loss": 0.08, "num_tokens": 2280194831.0, "reward": 2.2310268878936768, "reward_std": 0.4414485692977905, "rewards/accuracy_reward/mean": 0.3325892984867096, "rewards/accuracy_reward/std": 0.47166749835014343, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.146973118185997, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1041.4420166015625, "completions/mean_terminated_length": 795.3944702148438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8918011826754035, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14192262125918254, "kl": 0.02801513671875, "learning_rate": 1.3187720982418993e-07, "loss": 0.1003, "num_tokens": 2280728101.0, "reward": 2.421875, "reward_std": 0.5066932439804077, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.18345820903778076, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1152.946533203125, "completions/mean_terminated_length": 843.8438720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8920142773427095, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12033127255993122, "kl": 0.023101806640625, "learning_rate": 1.3175357145660204e-07, "loss": 0.0813, "num_tokens": 2281316733.0, "reward": 2.4263393878936768, "reward_std": 0.431247740983963, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11286582797765732, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1077.997802734375, "completions/mean_terminated_length": 827.3230590820312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8922273720100155, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13569091351186136, "kl": 0.026092529296875, "learning_rate": 1.3163016455639832e-07, "loss": 0.1356, "num_tokens": 2281872172.0, "reward": 2.3443081378936768, "reward_std": 0.513056755065918, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.17603272199630737, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1031.9598388671875, "completions/mean_terminated_length": 817.767578125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8924404666773214, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11088941502325582, "kl": 0.025634765625, "learning_rate": 1.3150698919187504e-07, "loss": 0.0584, "num_tokens": 2282401114.0, "reward": 2.396763563156128, "reward_std": 0.4460963010787964, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518688440323, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1115.3482666015625, "completions/mean_terminated_length": 860.9886474609375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8926535613446274, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11464826896376981, "kl": 0.023590087890625, "learning_rate": 1.313840454312004e-07, "loss": 0.0708, "num_tokens": 2282972374.0, "reward": 2.2611608505249023, "reward_std": 0.44804778695106506, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17468811571598053, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1154.46435546875, "completions/mean_terminated_length": 827.5609741210938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8928666560119333, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.29259013530090344, "kl": 0.025634765625, "learning_rate": 1.3126133334241417e-07, "loss": 0.1053, "num_tokens": 2283564054.0, "reward": 2.217076063156128, "reward_std": 0.4451143741607666, "rewards/accuracy_reward/mean": 0.3147321343421936, "rewards/accuracy_reward/std": 0.4649282693862915, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 925.8035888671875, "completions/mean_terminated_length": 731.916259765625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8930797506792393, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.27182220454498246, "kl": 0.03485107421875, "learning_rate": 1.3113885299342834e-07, "loss": 0.1074, "num_tokens": 2284042638.0, "reward": 2.51171875, "reward_std": 0.40785545110702515, "rewards/accuracy_reward/mean": 0.5982142686843872, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1258051097393036, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 966.4129638671875, "completions/mean_terminated_length": 786.1484375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.8932928453465452, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1418838796836582, "kl": 0.02880859375, "learning_rate": 1.3101660445202623e-07, "loss": 0.093, "num_tokens": 2284545559.0, "reward": 2.5005581378936768, "reward_std": 0.4155113101005554, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561822891235352, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 973.4152221679688, "completions/mean_terminated_length": 714.4431762695312, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8935059400138512, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12938872114589273, "kl": 0.02685546875, "learning_rate": 1.3089458778586318e-07, "loss": 0.0599, "num_tokens": 2285046257.0, "reward": 2.421875, "reward_std": 0.3593243658542633, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11321921646595001, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1034.571533203125, "completions/mean_terminated_length": 850.0686645507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8937190346811571, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11765186524065456, "kl": 0.026153564453125, "learning_rate": 1.3077280306246593e-07, "loss": 0.0601, "num_tokens": 2285580977.0, "reward": 2.46484375, "reward_std": 0.47264814376831055, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801211535930634, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 974.0045166015625, "completions/mean_terminated_length": 814.2821044921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.893932129348463, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13825517255564443, "kl": 0.02789306640625, "learning_rate": 1.3065125034923304e-07, "loss": 0.0749, "num_tokens": 2286081107.0, "reward": 2.52734375, "reward_std": 0.48276105523109436, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.8861607313156128, "rewards/format_reward/std": 0.31797102093696594, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.12802813947200775, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1046.446533203125, "completions/mean_terminated_length": 794.6591796875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.894145224015769, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12304409933832433, "kl": 0.024383544921875, "learning_rate": 1.3052992971343486e-07, "loss": 0.1059, "num_tokens": 2286624107.0, "reward": 2.41015625, "reward_std": 0.4155735373497009, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1096.185302734375, "completions/mean_terminated_length": 870.0635986328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.8943583186830749, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13258420481586983, "kl": 0.025787353515625, "learning_rate": 1.3040884122221276e-07, "loss": 0.0356, "num_tokens": 2287188590.0, "reward": 2.204799175262451, "reward_std": 0.43173640966415405, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 896.154052734375, "completions/mean_terminated_length": 653.3324584960938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8945714133503809, "frac_reward_zero_std": 0.25, "grad_norm": 0.14441045907185743, "kl": 0.0303955078125, "learning_rate": 1.3028798494258004e-07, "loss": 0.0779, "num_tokens": 2287658419.0, "reward": 2.4921875, "reward_std": 0.2945016026496887, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11141301691532135, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 969.8973388671875, "completions/mean_terminated_length": 793.4805297851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8947845080176868, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12714605252413247, "kl": 0.027099609375, "learning_rate": 1.301673609414215e-07, "loss": 0.0551, "num_tokens": 2288160101.0, "reward": 2.53515625, "reward_std": 0.39811643958091736, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.09791535139083862, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1040.462158203125, "completions/mean_terminated_length": 853.8809204101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8949976026849928, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1390015289080822, "kl": 0.02569580078125, "learning_rate": 1.3004696928549322e-07, "loss": 0.1007, "num_tokens": 2288699492.0, "reward": 2.4441964626312256, "reward_std": 0.4293304979801178, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.225421741604805, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13376134634017944, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 938.15185546875, "completions/mean_terminated_length": 838.2384643554688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8952106973522987, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11586176869558541, "kl": 0.029266357421875, "learning_rate": 1.2992681004142276e-07, "loss": 0.0195, "num_tokens": 2289188232.0, "reward": 2.517857313156128, "reward_std": 0.31761831045150757, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10383255779743195, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 989.9397583007812, "completions/mean_terminated_length": 783.9706420898438, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.8954237920196048, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1388401179758574, "kl": 0.028717041015625, "learning_rate": 1.2980688327570906e-07, "loss": 0.0926, "num_tokens": 2289702941.0, "reward": 2.3839287757873535, "reward_std": 0.4524340033531189, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14767222106456757, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1012.341552734375, "completions/mean_terminated_length": 773.3434448242188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8956368866869107, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12290577045198806, "kl": 0.02691650390625, "learning_rate": 1.2968718905472242e-07, "loss": 0.0618, "num_tokens": 2290223958.0, "reward": 2.467076063156128, "reward_std": 0.4275207817554474, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151521027088165, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 947.2098388671875, "completions/mean_terminated_length": 760.3916625976562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8958499813542166, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13044974650150393, "kl": 0.029541015625, "learning_rate": 1.2956772744470455e-07, "loss": 0.029, "num_tokens": 2290714692.0, "reward": 2.4877233505249023, "reward_std": 0.4086536169052124, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.09564017504453659, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 740.0067138671875, "completions/mean_terminated_length": 625.7160034179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8960630760215226, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15158626577138942, "kl": 0.03448486328125, "learning_rate": 1.2944849851176803e-07, "loss": 0.0345, "num_tokens": 2291109543.0, "reward": 2.5357143878936768, "reward_std": 0.40671274065971375, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11652304977178574, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 962.8995971679688, "completions/mean_terminated_length": 782.0494995117188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8962761706888285, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12247905169907082, "kl": 0.026519775390625, "learning_rate": 1.2932950232189722e-07, "loss": 0.034, "num_tokens": 2291608138.0, "reward": 2.4168527126312256, "reward_std": 0.42413970828056335, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14945264160633087, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1094.384033203125, "completions/mean_terminated_length": 847.94384765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8964892653561345, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1214223562153993, "kl": 0.02349853515625, "learning_rate": 1.292107389409473e-07, "loss": 0.0653, "num_tokens": 2292170886.0, "reward": 2.3878350257873535, "reward_std": 0.45826229453086853, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519908487796783, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1057.1160888671875, "completions/mean_terminated_length": 808.0111694335938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8967023600234404, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1152694385859775, "kl": 0.027252197265625, "learning_rate": 1.290922084346447e-07, "loss": 0.0516, "num_tokens": 2292720426.0, "reward": 2.3839287757873535, "reward_std": 0.4220777750015259, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10649171471595764, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1043.8326416015625, "completions/mean_terminated_length": 801.8309936523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8969154546907464, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.22215838622224887, "kl": 0.032562255859375, "learning_rate": 1.289739108685869e-07, "loss": 0.0555, "num_tokens": 2293261375.0, "reward": 2.4190850257873535, "reward_std": 0.40214914083480835, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14027473330497742, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1113.638427734375, "completions/mean_terminated_length": 813.2094116210938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8971285493580523, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11475254776176785, "kl": 0.022491455078125, "learning_rate": 1.2885584630824267e-07, "loss": 0.0438, "num_tokens": 2293826573.0, "reward": 2.2667412757873535, "reward_std": 0.38323456048965454, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15391045808792114, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 990.3326416015625, "completions/mean_terminated_length": 770.8167114257812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.8973416440253583, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.145933542545102, "kl": 0.027740478515625, "learning_rate": 1.2873801481895168e-07, "loss": 0.0918, "num_tokens": 2294347810.0, "reward": 2.4698662757873535, "reward_std": 0.4441182613372803, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15645259618759155, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1084.5692138671875, "completions/mean_terminated_length": 832.1774291992188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.8975547386926642, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14802286626860592, "kl": 0.028717041015625, "learning_rate": 1.286204164659247e-07, "loss": 0.103, "num_tokens": 2294903041.0, "reward": 2.3973214626312256, "reward_std": 0.42042165994644165, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 967.4063110351562, "completions/mean_terminated_length": 767.2962646484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.8977678333599701, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14647362729737298, "kl": 0.028167724609375, "learning_rate": 1.2850305131424326e-07, "loss": 0.0867, "num_tokens": 2295400615.0, "reward": 2.3895089626312256, "reward_std": 0.42311400175094604, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1421017199754715, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 942.1406860351562, "completions/mean_terminated_length": 793.759521484375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8979809280272761, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13343624231106255, "kl": 0.02838134765625, "learning_rate": 1.2838591942886003e-07, "loss": 0.0815, "num_tokens": 2295892534.0, "reward": 2.5708706378936768, "reward_std": 0.4099407494068146, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14321638643741608, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1033.32373046875, "completions/mean_terminated_length": 839.02392578125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.898194022694582, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12198398701812983, "kl": 0.02728271484375, "learning_rate": 1.2826902087459878e-07, "loss": 0.0769, "num_tokens": 2296426871.0, "reward": 2.470424175262451, "reward_std": 0.4267212748527527, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13254131376743317, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1004.6406860351562, "completions/mean_terminated_length": 837.0543823242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.898407117361888, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13228505168562116, "kl": 0.030029296875, "learning_rate": 1.281523557161536e-07, "loss": 0.0957, "num_tokens": 2296950182.0, "reward": 2.39453125, "reward_std": 0.44787293672561646, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.16029441356658936, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 898.6250610351562, "completions/mean_terminated_length": 682.1644287109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.898620212029194, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14147386848775725, "kl": 0.030609130859375, "learning_rate": 1.280359240180898e-07, "loss": 0.0806, "num_tokens": 2297423646.0, "reward": 2.4921875, "reward_std": 0.4439813196659088, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.17418356239795685, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 939.2366333007812, "completions/mean_terminated_length": 733.9100341796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.8988333066965, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.130985492853666, "kl": 0.0306396484375, "learning_rate": 1.2791972584484353e-07, "loss": 0.0597, "num_tokens": 2297914568.0, "reward": 2.458705425262451, "reward_std": 0.4370371401309967, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14160890877246857, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 878.1629638671875, "completions/mean_terminated_length": 741.0499267578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8990464013638059, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12699886854049647, "kl": 0.0333251953125, "learning_rate": 1.2780376126072154e-07, "loss": 0.0773, "num_tokens": 2298379569.0, "reward": 2.6607143878936768, "reward_std": 0.4718056619167328, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42408111691474915, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12399725615978241, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1017.4085083007812, "completions/mean_terminated_length": 769.0387573242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8992594960311118, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1489353905178799, "kl": 0.029541015625, "learning_rate": 1.2768803032990128e-07, "loss": 0.0937, "num_tokens": 2298903352.0, "reward": 2.41015625, "reward_std": 0.4729800820350647, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.17231273651123047, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1004.57373046875, "completions/mean_terminated_length": 827.4908447265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8994725906984178, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.138638854902198, "kl": 0.029449462890625, "learning_rate": 1.27572533116431e-07, "loss": 0.0541, "num_tokens": 2299436633.0, "reward": 2.4140625, "reward_std": 0.47697728872299194, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15063102543354034, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 956.1785888671875, "completions/mean_terminated_length": 784.0827026367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8996856853657237, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1303104478676641, "kl": 0.02838134765625, "learning_rate": 1.2745726968422956e-07, "loss": 0.0699, "num_tokens": 2299927321.0, "reward": 2.6294643878936768, "reward_std": 0.4414198398590088, "rewards/accuracy_reward/mean": 0.6986607313156128, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12672585248947144, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1093.5, "completions/mean_terminated_length": 826.239990234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8998987800330297, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11705117625360466, "kl": 0.025665283203125, "learning_rate": 1.2734224009708657e-07, "loss": 0.0766, "num_tokens": 2300492041.0, "reward": 2.30078125, "reward_std": 0.44247496128082275, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1579248607158661, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 991.7701416015625, "completions/mean_terminated_length": 825.2842407226562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9001118747003356, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12788168234712893, "kl": 0.026031494140625, "learning_rate": 1.2722744441866184e-07, "loss": 0.0829, "num_tokens": 2301008386.0, "reward": 2.4681921005249023, "reward_std": 0.3792712390422821, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12359262257814407, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 957.60498046875, "completions/mean_terminated_length": 720.5625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9003249693676416, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1394535427903369, "kl": 0.030242919921875, "learning_rate": 1.2711288271248621e-07, "loss": 0.1288, "num_tokens": 2301505361.0, "reward": 2.388392925262451, "reward_std": 0.4781411290168762, "rewards/accuracy_reward/mean": 0.5115740895271301, "rewards/accuracy_reward/std": 0.5004456043243408, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15274205803871155, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1188.97998046875, "completions/mean_terminated_length": 874.7042236328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9005380640349475, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11048458935527253, "kl": 0.0238037109375, "learning_rate": 1.2699855504196075e-07, "loss": 0.0613, "num_tokens": 2302103448.0, "reward": 2.3699777126312256, "reward_std": 0.4487333595752716, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16516588628292084, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1006.6495971679688, "completions/mean_terminated_length": 810.5331420898438, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9007511587022535, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14691857776548145, "kl": 0.029876708984375, "learning_rate": 1.268844614703571e-07, "loss": 0.0902, "num_tokens": 2302619339.0, "reward": 2.532924175262451, "reward_std": 0.4915497601032257, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.13675068318843842, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 908.904052734375, "completions/mean_terminated_length": 759.3257446289062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9009642533695594, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.2034812647119121, "kl": 0.03277587890625, "learning_rate": 1.2677060206081726e-07, "loss": 0.0617, "num_tokens": 2303088880.0, "reward": 2.423549175262451, "reward_std": 0.34090951085090637, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11702418327331543, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 861.5826416015625, "completions/mean_terminated_length": 698.9771728515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9011773480368653, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13046261929793862, "kl": 0.033233642578125, "learning_rate": 1.2665697687635375e-07, "loss": 0.0445, "num_tokens": 2303538789.0, "reward": 2.494419813156128, "reward_std": 0.3952323794364929, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.12485509365797043, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 881.5803833007812, "completions/mean_terminated_length": 711.5396118164062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9013904427041713, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12463176794736495, "kl": 0.028564453125, "learning_rate": 1.2654358597984938e-07, "loss": 0.0381, "num_tokens": 2304001001.0, "reward": 2.556919813156128, "reward_std": 0.34749072790145874, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.4852356016635895, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12062390148639679, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1037.357177734375, "completions/mean_terminated_length": 824.302734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9016035373714772, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1265952066009801, "kl": 0.026611328125, "learning_rate": 1.2643042943405734e-07, "loss": 0.089, "num_tokens": 2304534969.0, "reward": 2.517857313156128, "reward_std": 0.44530680775642395, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1054.921875, "completions/mean_terminated_length": 815.5927734375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9018166320387833, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1318053868713785, "kl": 0.02850341796875, "learning_rate": 1.2631750730160113e-07, "loss": 0.1109, "num_tokens": 2305071254.0, "reward": 2.43359375, "reward_std": 0.4442411959171295, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14294590055942535, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 877.4219360351562, "completions/mean_terminated_length": 733.6666870117188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9020297267060892, "frac_reward_zero_std": 0.25, "grad_norm": 0.1190393386220355, "kl": 0.031097412109375, "learning_rate": 1.262048196449745e-07, "loss": 0.0448, "num_tokens": 2305537315.0, "reward": 2.4715402126312256, "reward_std": 0.32439255714416504, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09811913222074509, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1020.6183471679688, "completions/mean_terminated_length": 773.0221557617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9022428213733952, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13664299771269292, "kl": 0.025970458984375, "learning_rate": 1.2609236652654143e-07, "loss": 0.1146, "num_tokens": 2306062968.0, "reward": 2.4481027126312256, "reward_std": 0.45905932784080505, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.15174883604049683, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 858.1741333007812, "completions/mean_terminated_length": 763.5614624023438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9024559160407011, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1271575627244067, "kl": 0.03289794921875, "learning_rate": 1.2598014800853616e-07, "loss": 0.0141, "num_tokens": 2306518950.0, "reward": 2.4994421005249023, "reward_std": 0.4050199091434479, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518688440323, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 935.5647583007812, "completions/mean_terminated_length": 715.4572143554688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.902669010708007, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14287386451259723, "kl": 0.028564453125, "learning_rate": 1.2586816415306294e-07, "loss": 0.0953, "num_tokens": 2307005299.0, "reward": 2.427455425262451, "reward_std": 0.448410302400589, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396106719971, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13661938905715942, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 974.5000610351562, "completions/mean_terminated_length": 762.0962524414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.902882105375313, "frac_reward_zero_std": 0.0, "grad_norm": 0.1327412533063871, "kl": 0.027435302734375, "learning_rate": 1.2575641502209642e-07, "loss": 0.0703, "num_tokens": 2307507363.0, "reward": 2.404576063156128, "reward_std": 0.4565046727657318, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15764793753623962, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 907.107177734375, "completions/mean_terminated_length": 740.7877197265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9030952000426189, "frac_reward_zero_std": 0.0, "grad_norm": 0.14755042340416133, "kl": 0.0318603515625, "learning_rate": 1.2564490067748103e-07, "loss": 0.1489, "num_tokens": 2307980611.0, "reward": 2.431919813156128, "reward_std": 0.5226327180862427, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.1676388382911682, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 905.169677734375, "completions/mean_terminated_length": 748.5380859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9033082947099249, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1343186060310149, "kl": 0.02880859375, "learning_rate": 1.255336211809316e-07, "loss": 0.0577, "num_tokens": 2308454927.0, "reward": 2.4213171005249023, "reward_std": 0.41429373621940613, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11371295899152756, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1082.305908203125, "completions/mean_terminated_length": 832.744384765625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9035213893772308, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3314695454139338, "kl": 0.02947998046875, "learning_rate": 1.2542257659403267e-07, "loss": 0.0599, "num_tokens": 2309013784.0, "reward": 2.4140625, "reward_std": 0.4254540801048279, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1286761611700058, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1055.368408203125, "completions/mean_terminated_length": 849.3504028320312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9037344840445368, "frac_reward_zero_std": 0.0, "grad_norm": 0.14433420910004904, "kl": 0.028289794921875, "learning_rate": 1.2531176697823886e-07, "loss": 0.0826, "num_tokens": 2309554301.0, "reward": 2.3130581378936768, "reward_std": 0.5043104887008667, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14921022951602936, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1008.185302734375, "completions/mean_terminated_length": 771.7342529296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9039475787118427, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13982994218370837, "kl": 0.0263671875, "learning_rate": 1.252011923948751e-07, "loss": 0.111, "num_tokens": 2310075376.0, "reward": 2.4107143878936768, "reward_std": 0.4051538109779358, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.16231535375118256, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 982.74560546875, "completions/mean_terminated_length": 795.4172973632812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9041606733791487, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11036421730209296, "kl": 0.0260009765625, "learning_rate": 1.2509085290513563e-07, "loss": 0.048, "num_tokens": 2310584558.0, "reward": 2.4347100257873535, "reward_std": 0.3274381756782532, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962118953466415, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1092.8482666015625, "completions/mean_terminated_length": 846.01123046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9043737680464546, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11674024338207285, "kl": 0.02490234375, "learning_rate": 1.249807485700851e-07, "loss": 0.0632, "num_tokens": 2311154058.0, "reward": 2.38671875, "reward_std": 0.3895522654056549, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13748814165592194, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 939.6495971679688, "completions/mean_terminated_length": 771.5449829101562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9045868627137605, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13673097248726682, "kl": 0.029754638671875, "learning_rate": 1.248708794506578e-07, "loss": 0.0699, "num_tokens": 2311642765.0, "reward": 2.421875, "reward_std": 0.3794919550418854, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09024729579687119, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 959.8839721679688, "completions/mean_terminated_length": 781.8285522460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9047999573810666, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14706659730544427, "kl": 0.029052734375, "learning_rate": 1.2476124560765787e-07, "loss": 0.1089, "num_tokens": 2312143897.0, "reward": 2.4955358505249023, "reward_std": 0.5117605328559875, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.167940154671669, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1013.04248046875, "completions/mean_terminated_length": 798.2398681640625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9050130520483725, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11162120965158873, "kl": 0.02569580078125, "learning_rate": 1.2465184710175923e-07, "loss": 0.0904, "num_tokens": 2312660572.0, "reward": 2.521205425262451, "reward_std": 0.3844025731086731, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.216333270072937, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14505796134471893, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1138.9598388671875, "completions/mean_terminated_length": 904.039306640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9052261467156785, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.10849229028488235, "kl": 0.022613525390625, "learning_rate": 1.2454268399350553e-07, "loss": 0.0655, "num_tokens": 2313240586.0, "reward": 2.330357313156128, "reward_std": 0.44210320711135864, "rewards/accuracy_reward/mean": 0.4652777910232544, "rewards/accuracy_reward/std": 0.499371200799942, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 953.279052734375, "completions/mean_terminated_length": 796.8903198242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9054392413829844, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12638388795980343, "kl": 0.026763916015625, "learning_rate": 1.2443375634331032e-07, "loss": 0.0769, "num_tokens": 2313739751.0, "reward": 2.490513563156128, "reward_std": 0.4028940498828888, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 920.4531860351562, "completions/mean_terminated_length": 718.6815795898438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9056523360502904, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13766801273237217, "kl": 0.03131103515625, "learning_rate": 1.2432506421145674e-07, "loss": 0.0819, "num_tokens": 2314216978.0, "reward": 2.548549175262451, "reward_std": 0.44123563170433044, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12315750867128372, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1052.3460693359375, "completions/mean_terminated_length": 839.184326171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9058654307175963, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11674170474022959, "kl": 0.024322509765625, "learning_rate": 1.2421660765809736e-07, "loss": 0.0246, "num_tokens": 2314755357.0, "reward": 2.3565850257873535, "reward_std": 0.3697391450405121, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12359261512756348, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1059.435302734375, "completions/mean_terminated_length": 768.0086669921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9060785253849022, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11981847439064472, "kl": 0.025146484375, "learning_rate": 1.2410838674325472e-07, "loss": 0.0398, "num_tokens": 2315309600.0, "reward": 2.3392858505249023, "reward_std": 0.40727052092552185, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12782442569732666, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 875.1830444335938, "completions/mean_terminated_length": 769.6009521484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9062916200522082, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14090586009353856, "kl": 0.031219482421875, "learning_rate": 1.2400040152682085e-07, "loss": 0.1263, "num_tokens": 2315763826.0, "reward": 2.5541296005249023, "reward_std": 0.38873523473739624, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835129737854, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11440251022577286, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 923.0156860351562, "completions/mean_terminated_length": 775.2904052734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9065047147195141, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12553222374701267, "kl": 0.032135009765625, "learning_rate": 1.238926520685572e-07, "loss": 0.0282, "num_tokens": 2316244265.0, "reward": 2.5708706378936768, "reward_std": 0.4033043086528778, "rewards/accuracy_reward/mean": 0.6741071343421936, "rewards/accuracy_reward/std": 0.4692314565181732, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 891.4375610351562, "completions/mean_terminated_length": 698.6771240234375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9067178093868201, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13926558369163775, "kl": 0.0289306640625, "learning_rate": 1.2378513842809484e-07, "loss": 0.0684, "num_tokens": 2316710173.0, "reward": 2.4994421005249023, "reward_std": 0.3587874174118042, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692909002304, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1053.6785888671875, "completions/mean_terminated_length": 810.6222534179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.906930904054126, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13950988882714518, "kl": 0.027984619140625, "learning_rate": 1.2367786066493447e-07, "loss": 0.0793, "num_tokens": 2317259133.0, "reward": 2.2924108505249023, "reward_std": 0.49396976828575134, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16516682505607605, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1084.0, "completions/mean_terminated_length": 861.5385131835938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.907143998721432, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12308345998433272, "kl": 0.023468017578125, "learning_rate": 1.235708188384461e-07, "loss": 0.0848, "num_tokens": 2317818477.0, "reward": 2.392299175262451, "reward_std": 0.45165103673934937, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519908487796783, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1014.6808471679688, "completions/mean_terminated_length": 826.5567626953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9073570933887379, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12103463382609936, "kl": 0.027069091796875, "learning_rate": 1.2346401300786898e-07, "loss": 0.0657, "num_tokens": 2318347150.0, "reward": 2.4765625, "reward_std": 0.4353918135166168, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1259699910879135, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 890.5803833007812, "completions/mean_terminated_length": 735.281005859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.907570188056044, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12506846704969493, "kl": 0.029296875, "learning_rate": 1.2335744323231218e-07, "loss": 0.0792, "num_tokens": 2318811570.0, "reward": 2.5385046005249023, "reward_std": 0.39858993887901306, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.11946304887533188, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 921.0245971679688, "completions/mean_terminated_length": 760.0280151367188, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9077832827233498, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15408457322376723, "kl": 0.029937744140625, "learning_rate": 1.2325110957075382e-07, "loss": 0.0828, "num_tokens": 2319288493.0, "reward": 2.5111608505249023, "reward_std": 0.449688583612442, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10152243822813034, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1021.9576416015625, "completions/mean_terminated_length": 825.4813842773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9079963773906558, "frac_reward_zero_std": 0.0, "grad_norm": 0.14398715293030606, "kl": 0.026763916015625, "learning_rate": 1.2314501208204163e-07, "loss": 0.0568, "num_tokens": 2319809098.0, "reward": 2.3822546005249023, "reward_std": 0.48828285932540894, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15371057391166687, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1070.4576416015625, "completions/mean_terminated_length": 844.8709106445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9082094720579618, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12148863810616908, "kl": 0.025634765625, "learning_rate": 1.2303915082489212e-07, "loss": 0.0747, "num_tokens": 2320353831.0, "reward": 2.3950893878936768, "reward_std": 0.4405044913291931, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14088857173919678, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 961.560302734375, "completions/mean_terminated_length": 767.144775390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9084225667252677, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14721812106006513, "kl": 0.031494140625, "learning_rate": 1.229335258578916e-07, "loss": 0.0856, "num_tokens": 2320859458.0, "reward": 2.4447546005249023, "reward_std": 0.4358256459236145, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1240563914179802, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1003.2366333007812, "completions/mean_terminated_length": 786.39892578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9086356613925737, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13401333322048226, "kl": 0.030853271484375, "learning_rate": 1.2282813723949535e-07, "loss": 0.0549, "num_tokens": 2321377452.0, "reward": 2.3973214626312256, "reward_std": 0.44694262742996216, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.15583296120166779, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1005.6785888671875, "completions/mean_terminated_length": 732.6196899414062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9088487560598796, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14578539442791744, "kl": 0.02801513671875, "learning_rate": 1.2272298502802798e-07, "loss": 0.092, "num_tokens": 2321897644.0, "reward": 2.4190850257873535, "reward_std": 0.3997432291507721, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1247187927365303, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 929.2344360351562, "completions/mean_terminated_length": 746.1636352539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9090618507271856, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14716167278936598, "kl": 0.029632568359375, "learning_rate": 1.226180692816831e-07, "loss": 0.0859, "num_tokens": 2322389525.0, "reward": 2.3353796005249023, "reward_std": 0.4462164640426636, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.14720545709133148, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1090.3348388671875, "completions/mean_terminated_length": 818.67626953125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9092749453944915, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11840403755799288, "kl": 0.02435302734375, "learning_rate": 1.2251339005852348e-07, "loss": 0.0576, "num_tokens": 2322957995.0, "reward": 2.3387277126312256, "reward_std": 0.41767117381095886, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14566238224506378, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 900.7433471679688, "completions/mean_terminated_length": 706.0391845703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9094880400617974, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15020398941292937, "kl": 0.03167724609375, "learning_rate": 1.2240894741648132e-07, "loss": 0.0931, "num_tokens": 2323427928.0, "reward": 2.4988839626312256, "reward_std": 0.4546898305416107, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.14203143119812012, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1023.6183471679688, "completions/mean_terminated_length": 843.4776611328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9097011347291034, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11696688299387133, "kl": 0.026123046875, "learning_rate": 1.223047414133574e-07, "loss": 0.0577, "num_tokens": 2323957469.0, "reward": 2.4503350257873535, "reward_std": 0.434961199760437, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1018.138427734375, "completions/mean_terminated_length": 837.0341186523438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9099142293964093, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12419326936492042, "kl": 0.027252197265625, "learning_rate": 1.2220077210682178e-07, "loss": 0.0757, "num_tokens": 2324485291.0, "reward": 2.4810268878936768, "reward_std": 0.40673476457595825, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13915444910526276, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 962.6964721679688, "completions/mean_terminated_length": 778.5065307617188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9101273240637153, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12468537102183236, "kl": 0.0291748046875, "learning_rate": 1.2209703955441358e-07, "loss": 0.0504, "num_tokens": 2324980083.0, "reward": 2.4302456378936768, "reward_std": 0.4462333023548126, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14801737666130066, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 961.450927734375, "completions/mean_terminated_length": 728.8292846679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9103404187310212, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11582106158470883, "kl": 0.0269775390625, "learning_rate": 1.219935438135408e-07, "loss": 0.0902, "num_tokens": 2325472813.0, "reward": 2.4291296005249023, "reward_std": 0.408407986164093, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11919104307889938, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 747.0469360351562, "completions/mean_terminated_length": 622.9951171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9105535133983272, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14938501318552166, "kl": 0.0360107421875, "learning_rate": 1.2189028494148044e-07, "loss": 0.0598, "num_tokens": 2325870082.0, "reward": 2.4810268878936768, "reward_std": 0.3640085756778717, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.0999298095703125, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 923.5201416015625, "completions/mean_terminated_length": 729.2382202148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9107666080656331, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1605242157095266, "kl": 0.03106689453125, "learning_rate": 1.2178726299537836e-07, "loss": 0.085, "num_tokens": 2326358123.0, "reward": 2.4765625, "reward_std": 0.43555253744125366, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1314026117324829, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 950.68310546875, "completions/mean_terminated_length": 797.114501953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9109797027329392, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12827012684405112, "kl": 0.029083251953125, "learning_rate": 1.2168447803224925e-07, "loss": 0.0508, "num_tokens": 2326850141.0, "reward": 2.4715402126312256, "reward_std": 0.3935849368572235, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10092892497777939, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1135.4129638671875, "completions/mean_terminated_length": 856.0496215820312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9111927974002451, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14282375204998463, "kl": 0.024200439453125, "learning_rate": 1.2158193010897695e-07, "loss": 0.1334, "num_tokens": 2327431046.0, "reward": 2.3404018878936768, "reward_std": 0.4842710793018341, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14114972949028015, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1236.5, "completions/mean_terminated_length": 879.0225219726562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.911405892067551, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11805929002768534, "kl": 0.022003173828125, "learning_rate": 1.2147961928231356e-07, "loss": 0.0893, "num_tokens": 2328058022.0, "reward": 2.3231027126312256, "reward_std": 0.41856852173805237, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12428762763738632, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1113.321533203125, "completions/mean_terminated_length": 767.4617919921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.911618986734857, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13612824629957904, "kl": 0.028472900390625, "learning_rate": 1.2137754560888054e-07, "loss": 0.0559, "num_tokens": 2328625654.0, "reward": 2.3621652126312256, "reward_std": 0.431217223405838, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.16706722974777222, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1051.841552734375, "completions/mean_terminated_length": 790.8760375976562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9118320814021629, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13698096561857806, "kl": 0.025848388671875, "learning_rate": 1.2127570914516777e-07, "loss": 0.089, "num_tokens": 2329174783.0, "reward": 2.3309152126312256, "reward_std": 0.41191932559013367, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15693362057209015, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 897.4910888671875, "completions/mean_terminated_length": 726.3897705078125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9120451760694689, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14422054934777795, "kl": 0.03009033203125, "learning_rate": 1.2117410994753396e-07, "loss": 0.0906, "num_tokens": 2329647339.0, "reward": 2.4564733505249023, "reward_std": 0.3915725350379944, "rewards/accuracy_reward/mean": 0.5671296119689941, "rewards/accuracy_reward/std": 0.4960475564002991, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14015565812587738, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 916.747802734375, "completions/mean_terminated_length": 735.0440063476562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9122582707367748, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1252950433684357, "kl": 0.029754638671875, "learning_rate": 1.2107274807220644e-07, "loss": 0.0611, "num_tokens": 2330133578.0, "reward": 2.490513563156128, "reward_std": 0.3581659495830536, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.09647680073976517, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 939.5379638671875, "completions/mean_terminated_length": 764.8191528320312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9124713654040808, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1356447306248167, "kl": 0.028961181640625, "learning_rate": 1.2097162357528126e-07, "loss": 0.1131, "num_tokens": 2330625659.0, "reward": 2.3705358505249023, "reward_std": 0.4865240156650543, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14767222106456757, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 999.71435546875, "completions/mean_terminated_length": 754.2479248046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9126844600713867, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11800086440673914, "kl": 0.028350830078125, "learning_rate": 1.2087073651272314e-07, "loss": 0.0557, "num_tokens": 2331140475.0, "reward": 2.4324777126312256, "reward_std": 0.37189552187919617, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12015076726675034, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1002.6116333007812, "completions/mean_terminated_length": 818.7769165039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9128975547386927, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12836968933771142, "kl": 0.02880859375, "learning_rate": 1.2077008694036527e-07, "loss": 0.0518, "num_tokens": 2331659085.0, "reward": 2.513951063156128, "reward_std": 0.44159072637557983, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13588985800743103, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1032.227783203125, "completions/mean_terminated_length": 783.9277954101562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9131106494059986, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13537101938695856, "kl": 0.026031494140625, "learning_rate": 1.2066967491390963e-07, "loss": 0.0671, "num_tokens": 2332194867.0, "reward": 2.4375, "reward_std": 0.38138601183891296, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.14033812284469604, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1214.69873046875, "completions/mean_terminated_length": 862.8603515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9133237440733045, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10336607880925264, "kl": 0.021759033203125, "learning_rate": 1.205695004889264e-07, "loss": 0.0516, "num_tokens": 2332814188.0, "reward": 2.17578125, "reward_std": 0.4046914279460907, "rewards/accuracy_reward/mean": 0.2790178656578064, "rewards/accuracy_reward/std": 0.449017733335495, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 992.1339721679688, "completions/mean_terminated_length": 789.94677734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.9135368387406105, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.8502097133076573, "kl": 0.030029296875, "learning_rate": 1.204695637208546e-07, "loss": 0.0794, "num_tokens": 2333326328.0, "reward": 2.392857313156128, "reward_std": 0.47749075293540955, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9598214030265808, "rewards/tag_count_reward/std": 0.1605832874774933, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 936.2277221679688, "completions/mean_terminated_length": 757.65283203125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9137499334079164, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12488189365979543, "kl": 0.028228759765625, "learning_rate": 1.203698646650015e-07, "loss": 0.0882, "num_tokens": 2333818062.0, "reward": 2.4135046005249023, "reward_std": 0.4057147800922394, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10190140455961227, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 950.4732666015625, "completions/mean_terminated_length": 780.7525634765625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9139630280752225, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12598054630434816, "kl": 0.02813720703125, "learning_rate": 1.2027040337654287e-07, "loss": 0.0384, "num_tokens": 2334311762.0, "reward": 2.3582589626312256, "reward_std": 0.3835171163082123, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389531940221786, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1036.196533203125, "completions/mean_terminated_length": 763.8980102539062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9141761227425284, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13386836270126953, "kl": 0.026885986328125, "learning_rate": 1.2017117991052303e-07, "loss": 0.0685, "num_tokens": 2334853018.0, "reward": 2.40234375, "reward_std": 0.42113855481147766, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13124457001686096, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1098.32373046875, "completions/mean_terminated_length": 846.1497192382812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9143892174098344, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12084877075261727, "kl": 0.022552490234375, "learning_rate": 1.2007219432185455e-07, "loss": 0.0689, "num_tokens": 2335415099.0, "reward": 2.415736675262451, "reward_std": 0.4345530867576599, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11801212280988693, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1020.497802734375, "completions/mean_terminated_length": 803.8892211914062, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9146023120771403, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12828342832865927, "kl": 0.026824951171875, "learning_rate": 1.1997344666531832e-07, "loss": 0.0489, "num_tokens": 2335944458.0, "reward": 2.4832589626312256, "reward_std": 0.4384419023990631, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09158609062433243, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1045.7098388671875, "completions/mean_terminated_length": 807.5967407226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9148154067444462, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1556944719675982, "kl": 0.030609130859375, "learning_rate": 1.1987493699556365e-07, "loss": 0.0506, "num_tokens": 2336490952.0, "reward": 2.4453125, "reward_std": 0.39094749093055725, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765773296356, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 978.7366333007812, "completions/mean_terminated_length": 784.068603515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9150285014117522, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.39794369173073857, "kl": 0.028778076171875, "learning_rate": 1.1977666536710803e-07, "loss": 0.112, "num_tokens": 2337007042.0, "reward": 2.37890625, "reward_std": 0.402964323759079, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1446475237607956, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 856.1986694335938, "completions/mean_terminated_length": 719.8233642578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9152415960790581, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12127234165709, "kl": 0.03399658203125, "learning_rate": 1.196786318343374e-07, "loss": 0.051, "num_tokens": 2337448267.0, "reward": 2.687500238418579, "reward_std": 0.35360977053642273, "rewards/accuracy_reward/mean": 0.7611607313156128, "rewards/accuracy_reward/std": 0.4268510043621063, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10152244567871094, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1016.2366333007812, "completions/mean_terminated_length": 778.1373901367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9154546907463641, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1289341021294902, "kl": 0.027374267578125, "learning_rate": 1.1958083645150568e-07, "loss": 0.1109, "num_tokens": 2337977125.0, "reward": 2.3671875, "reward_std": 0.5158634185791016, "rewards/accuracy_reward/mean": 0.4709821343421936, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15520283579826355, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1020.2522583007812, "completions/mean_terminated_length": 765.46240234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.91566778541367, "frac_reward_zero_std": 0.0, "grad_norm": 0.14099058378325344, "kl": 0.0260009765625, "learning_rate": 1.1948327927273528e-07, "loss": 0.0911, "num_tokens": 2338503910.0, "reward": 2.3861608505249023, "reward_std": 0.47436875104904175, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1540280133485794, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1019.4777221679688, "completions/mean_terminated_length": 731.4913940429688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.915880880080976, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1357149115479806, "kl": 0.0299072265625, "learning_rate": 1.1938596035201656e-07, "loss": 0.0569, "num_tokens": 2339022428.0, "reward": 2.4559152126312256, "reward_std": 0.4842139184474945, "rewards/accuracy_reward/mean": 0.5892857313156128, "rewards/accuracy_reward/std": 0.4925134479999542, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16430194675922394, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 923.6027221679688, "completions/mean_terminated_length": 708.2925415039062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9160939747482819, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.16919233253088392, "kl": 0.030181884765625, "learning_rate": 1.1928887974320806e-07, "loss": 0.1233, "num_tokens": 2339499194.0, "reward": 2.40625, "reward_std": 0.38667890429496765, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11036036163568497, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 863.0781860351562, "completions/mean_terminated_length": 717.5614013671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9163070694155879, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1450599614166613, "kl": 0.0333251953125, "learning_rate": 1.191920375000365e-07, "loss": 0.0441, "num_tokens": 2339952333.0, "reward": 2.5965402126312256, "reward_std": 0.3840074837207794, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9893973469734192, "rewards/tag_count_reward/std": 0.07496661692857742, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 915.529052734375, "completions/mean_terminated_length": 733.6295166015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9165201640828938, "frac_reward_zero_std": 0.0, "grad_norm": 0.14443095441150788, "kl": 0.031646728515625, "learning_rate": 1.1909543367609663e-07, "loss": 0.0902, "num_tokens": 2340428650.0, "reward": 2.4268975257873535, "reward_std": 0.4631510078907013, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.12691166996955872, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 979.5313110351562, "completions/mean_terminated_length": 736.5643920898438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9167332587501997, "frac_reward_zero_std": 0.0, "grad_norm": 0.5350156622927484, "kl": 0.034820556640625, "learning_rate": 1.189990683248513e-07, "loss": 0.0684, "num_tokens": 2340949384.0, "reward": 2.4185268878936768, "reward_std": 0.4733780026435852, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13033421337604523, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 884.7031860351562, "completions/mean_terminated_length": 676.5342407226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9169463534175057, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14434482262669748, "kl": 0.03131103515625, "learning_rate": 1.1890294149963134e-07, "loss": 0.1253, "num_tokens": 2341405971.0, "reward": 2.455357313156128, "reward_std": 0.4411734342575073, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1378791630268097, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1057.716552734375, "completions/mean_terminated_length": 825.8319702148438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.9171594480848116, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11460955396170243, "kl": 0.025726318359375, "learning_rate": 1.188070532536356e-07, "loss": 0.0574, "num_tokens": 2341957316.0, "reward": 2.435826063156128, "reward_std": 0.36691829562187195, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09911917895078659, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1074.3504638671875, "completions/mean_terminated_length": 798.1575927734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9173725427521177, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1297835240496017, "kl": 0.024688720703125, "learning_rate": 1.187114036399309e-07, "loss": 0.0679, "num_tokens": 2342507281.0, "reward": 2.3950893878936768, "reward_std": 0.38043105602264404, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1156.8504638671875, "completions/mean_terminated_length": 877.222900390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9175856374194236, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11543840863198242, "kl": 0.023590087890625, "learning_rate": 1.1861599271145194e-07, "loss": 0.0584, "num_tokens": 2343091406.0, "reward": 2.2427456378936768, "reward_std": 0.4806744456291199, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16570167243480682, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 999.232177734375, "completions/mean_terminated_length": 811.5579223632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9177987320867296, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13559579595572716, "kl": 0.026763916015625, "learning_rate": 1.1852082052100142e-07, "loss": 0.0722, "num_tokens": 2343613174.0, "reward": 2.37890625, "reward_std": 0.4389708638191223, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13519908487796783, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 942.1250610351562, "completions/mean_terminated_length": 723.3155517578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9180118267540355, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1336596778119635, "kl": 0.02728271484375, "learning_rate": 1.184258871212497e-07, "loss": 0.0772, "num_tokens": 2344111438.0, "reward": 2.4815850257873535, "reward_std": 0.4398045539855957, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14372976124286652, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1065.3035888671875, "completions/mean_terminated_length": 797.2954711914062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9182249214213414, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11788909961402896, "kl": 0.025848388671875, "learning_rate": 1.1833119256473539e-07, "loss": 0.0559, "num_tokens": 2344662374.0, "reward": 2.415736675262451, "reward_std": 0.3968111276626587, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10550089925527573, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1028.203125, "completions/mean_terminated_length": 836.1458740234375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9184380160886474, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13206484514646272, "kl": 0.025390625, "learning_rate": 1.182367369038646e-07, "loss": 0.0682, "num_tokens": 2345191185.0, "reward": 2.4224331378936768, "reward_std": 0.45625853538513184, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390644073486, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.126290425658226, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 940.638427734375, "completions/mean_terminated_length": 772.6837768554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9186511107559533, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1344194588287428, "kl": 0.029266357421875, "learning_rate": 1.1814252019091115e-07, "loss": 0.107, "num_tokens": 2345686591.0, "reward": 2.53125, "reward_std": 0.4150005578994751, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10152245312929153, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1006.107177734375, "completions/mean_terminated_length": 755.0138549804688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9188642054232593, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12089684824738905, "kl": 0.0262451171875, "learning_rate": 1.1804854247801699e-07, "loss": 0.0439, "num_tokens": 2346200831.0, "reward": 2.53515625, "reward_std": 0.4017380475997925, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10703980922698975, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 932.0982666015625, "completions/mean_terminated_length": 782.36962890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9190773000905652, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12793975353724973, "kl": 0.027984619140625, "learning_rate": 1.1795480381719142e-07, "loss": 0.0586, "num_tokens": 2346682939.0, "reward": 2.5184152126312256, "reward_std": 0.4725482761859894, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1199323758482933, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1067.712158203125, "completions/mean_terminated_length": 800.36083984375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9192903947578712, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11571186625670028, "kl": 0.023834228515625, "learning_rate": 1.178613042603118e-07, "loss": 0.068, "num_tokens": 2347227914.0, "reward": 2.4268975257873535, "reward_std": 0.3438253104686737, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.09953393787145615, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1040.7879638671875, "completions/mean_terminated_length": 851.1007690429688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9195034894251771, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.121292778555046, "kl": 0.024444580078125, "learning_rate": 1.1776804385912288e-07, "loss": 0.0488, "num_tokens": 2347768491.0, "reward": 2.478236675262451, "reward_std": 0.41314926743507385, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10940459370613098, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1045.83935546875, "completions/mean_terminated_length": 772.5227661132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9197165840924831, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11483484926167137, "kl": 0.02471923828125, "learning_rate": 1.1767502266523702e-07, "loss": 0.0825, "num_tokens": 2348309075.0, "reward": 2.458705425262451, "reward_std": 0.3995319902896881, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11055814474821091, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1081.9910888671875, "completions/mean_terminated_length": 789.9418334960938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.919929678759789, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11789900994788009, "kl": 0.025146484375, "learning_rate": 1.1758224073013455e-07, "loss": 0.0575, "num_tokens": 2348872831.0, "reward": 2.321986675262451, "reward_std": 0.46032291650772095, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15641570091247559, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1017.3460083007812, "completions/mean_terminated_length": 758.2429809570312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9201427734270949, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12007469625415519, "kl": 0.02655029296875, "learning_rate": 1.1748969810516305e-07, "loss": 0.0574, "num_tokens": 2349402250.0, "reward": 2.364955425262451, "reward_std": 0.4424870014190674, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11266100406646729, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 883.7701416015625, "completions/mean_terminated_length": 727.5569458007812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.920355868094401, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15157029828532473, "kl": 0.03057861328125, "learning_rate": 1.1739739484153782e-07, "loss": 0.1098, "num_tokens": 2349879603.0, "reward": 2.484375, "reward_std": 0.43522077798843384, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14383503794670105, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1041.3125, "completions/mean_terminated_length": 795.2333374023438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9205689627617069, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13355554414722376, "kl": 0.02606201171875, "learning_rate": 1.1730533099034159e-07, "loss": 0.0798, "num_tokens": 2350412431.0, "reward": 2.4129464626312256, "reward_std": 0.39991295337677, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16516682505607605, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 999.982177734375, "completions/mean_terminated_length": 789.2546997070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9207820574290129, "frac_reward_zero_std": 0.0, "grad_norm": 0.15804772731671182, "kl": 0.027496337890625, "learning_rate": 1.1721350660252484e-07, "loss": 0.0888, "num_tokens": 2350929383.0, "reward": 2.3705358505249023, "reward_std": 0.49632328748703003, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1139446347951889, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1178.1585693359375, "completions/mean_terminated_length": 863.5349731445312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9209951520963188, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11561173890677831, "kl": 0.025634765625, "learning_rate": 1.171219217289051e-07, "loss": 0.0705, "num_tokens": 2351536462.0, "reward": 2.1919643878936768, "reward_std": 0.4486224949359894, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387791991233826, "rewards/tag_count_reward/mean": 0.9508928656578064, "rewards/tag_count_reward/std": 0.17882691323757172, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1071.388427734375, "completions/mean_terminated_length": 808.5609130859375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9212082467636248, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11506240459693362, "kl": 0.024932861328125, "learning_rate": 1.170305764201677e-07, "loss": 0.0467, "num_tokens": 2352084636.0, "reward": 2.396205425262451, "reward_std": 0.4571070969104767, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1662929803133011, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1017.8214721679688, "completions/mean_terminated_length": 823.8090209960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9214213414309307, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14698407357256527, "kl": 0.027099609375, "learning_rate": 1.1693947072686525e-07, "loss": 0.1331, "num_tokens": 2352605356.0, "reward": 2.4854912757873535, "reward_std": 0.5262283682823181, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16962288320064545, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1063.357177734375, "completions/mean_terminated_length": 812.3697509765625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9216344360982366, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11957730266521888, "kl": 0.02593994140625, "learning_rate": 1.1684860469941785e-07, "loss": 0.0543, "num_tokens": 2353149308.0, "reward": 2.4441964626312256, "reward_std": 0.387107789516449, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1136813834309578, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 935.2053833007812, "completions/mean_terminated_length": 742.9424438476562, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9218475307655426, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12937431877539943, "kl": 0.029693603515625, "learning_rate": 1.167579783881128e-07, "loss": 0.0381, "num_tokens": 2353638248.0, "reward": 2.4614956378936768, "reward_std": 0.3544231057167053, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11338312178850174, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 943.71435546875, "completions/mean_terminated_length": 728.7466430664062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9220606254328485, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12490186245087269, "kl": 0.02740478515625, "learning_rate": 1.1666759184310484e-07, "loss": 0.0873, "num_tokens": 2354121480.0, "reward": 2.50390625, "reward_std": 0.331051766872406, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13311463594436646, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1001.107177734375, "completions/mean_terminated_length": 759.5164794921875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9222737201001545, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14375470230631254, "kl": 0.028717041015625, "learning_rate": 1.1657744511441606e-07, "loss": 0.0886, "num_tokens": 2354643128.0, "reward": 2.4363839626312256, "reward_std": 0.4349292516708374, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1421368569135666, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1097.703125, "completions/mean_terminated_length": 912.7119750976562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9224868147674604, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12177590231414014, "kl": 0.0252685546875, "learning_rate": 1.1648753825193577e-07, "loss": 0.0652, "num_tokens": 2355204931.0, "reward": 2.407924175262451, "reward_std": 0.5403696894645691, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9481026530265808, "rewards/tag_count_reward/std": 0.17998811602592468, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 987.4777221679688, "completions/mean_terminated_length": 739.14599609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9226999094347664, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1244516265434499, "kl": 0.02679443359375, "learning_rate": 1.1639787130542042e-07, "loss": 0.0752, "num_tokens": 2355718857.0, "reward": 2.4542412757873535, "reward_std": 0.38402825593948364, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49767759442329407, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765773296356, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1038.0, "completions/mean_terminated_length": 787.6099853515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9229130041020723, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1287583069659273, "kl": 0.024749755859375, "learning_rate": 1.1630844432449396e-07, "loss": 0.0861, "num_tokens": 2356253417.0, "reward": 2.4715402126312256, "reward_std": 0.4852500259876251, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17179030179977417, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1043.4442138671875, "completions/mean_terminated_length": 804.7928466796875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9231260987693783, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1246779542824781, "kl": 0.025909423828125, "learning_rate": 1.1621925735864729e-07, "loss": 0.0749, "num_tokens": 2356790528.0, "reward": 2.459263563156128, "reward_std": 0.4631551504135132, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.15343420207500458, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 917.7232666015625, "completions/mean_terminated_length": 759.5419921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9233391934366842, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12900315545708918, "kl": 0.02825927734375, "learning_rate": 1.1613031045723862e-07, "loss": 0.0738, "num_tokens": 2357271044.0, "reward": 2.486607313156128, "reward_std": 0.31329184770584106, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10517053306102753, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1012.76123046875, "completions/mean_terminated_length": 824.2876586914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9235522881039901, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1429332560598743, "kl": 0.0281982421875, "learning_rate": 1.1604160366949318e-07, "loss": 0.0978, "num_tokens": 2357792905.0, "reward": 2.4073662757873535, "reward_std": 0.37460365891456604, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12758491933345795, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1106.8035888671875, "completions/mean_terminated_length": 832.85302734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9237653827712962, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13513841988564942, "kl": 0.024566650390625, "learning_rate": 1.1595313704450339e-07, "loss": 0.0673, "num_tokens": 2358358865.0, "reward": 2.3275671005249023, "reward_std": 0.3640348017215729, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4925134778022766, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.10996230691671371, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 978.58935546875, "completions/mean_terminated_length": 766.9946899414062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9239784774386021, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16777596381695412, "kl": 0.029052734375, "learning_rate": 1.1586491063122883e-07, "loss": 0.1057, "num_tokens": 2358866601.0, "reward": 2.3978796005249023, "reward_std": 0.4449254870414734, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16232208907604218, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1036.3013916015625, "completions/mean_terminated_length": 813.0108642578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9241915721059081, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12252475126418538, "kl": 0.0250244140625, "learning_rate": 1.1577692447849605e-07, "loss": 0.0621, "num_tokens": 2359397712.0, "reward": 2.400111675262451, "reward_std": 0.45364245772361755, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13485698401927948, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1109.2366943359375, "completions/mean_terminated_length": 818.2748413085938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.924404666773214, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.10873342068510086, "kl": 0.022979736328125, "learning_rate": 1.1568917863499861e-07, "loss": 0.0388, "num_tokens": 2359966026.0, "reward": 2.4285714626312256, "reward_std": 0.35046347975730896, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10818972438573837, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1090.4754638671875, "completions/mean_terminated_length": 782.5988159179688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.92461776144052, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12570502365110353, "kl": 0.0245361328125, "learning_rate": 1.1560167314929714e-07, "loss": 0.1176, "num_tokens": 2360529823.0, "reward": 2.4090402126312256, "reward_std": 0.49671119451522827, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15693362057209015, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1079.9866943359375, "completions/mean_terminated_length": 843.3611450195312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9248308561078259, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1336479273820792, "kl": 0.02447509765625, "learning_rate": 1.155144080698192e-07, "loss": 0.0664, "num_tokens": 2361091177.0, "reward": 2.310826063156128, "reward_std": 0.44756436347961426, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14869897067546844, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 916.6763916015625, "completions/mean_terminated_length": 745.08740234375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9250439507751319, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12520072399968074, "kl": 0.030731201171875, "learning_rate": 1.1542738344485942e-07, "loss": 0.0447, "num_tokens": 2361569544.0, "reward": 2.447544813156128, "reward_std": 0.3588140308856964, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14259286224842072, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 823.8839721679688, "completions/mean_terminated_length": 690.5643310546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9252570454424378, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.15758728483092396, "kl": 0.03179931640625, "learning_rate": 1.1534059932257908e-07, "loss": 0.0499, "num_tokens": 2362001636.0, "reward": 2.6205358505249023, "reward_std": 0.3683219254016876, "rewards/accuracy_reward/mean": 0.7175925970077515, "rewards/accuracy_reward/std": 0.45069241523742676, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.11923423409461975, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1136.8192138671875, "completions/mean_terminated_length": 861.345947265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9254701401097437, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13701561953919866, "kl": 0.023529052734375, "learning_rate": 1.1525405575100678e-07, "loss": 0.0378, "num_tokens": 2362580803.0, "reward": 2.424107313156128, "reward_std": 0.43557584285736084, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1353386640548706, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 952.43310546875, "completions/mean_terminated_length": 769.8385620117188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9256832347770497, "frac_reward_zero_std": 0.25, "grad_norm": 0.12526622000718188, "kl": 0.02789306640625, "learning_rate": 1.151677527780376e-07, "loss": 0.0747, "num_tokens": 2363079669.0, "reward": 2.462611675262451, "reward_std": 0.39819738268852234, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174957811832428, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12109260261058807, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 932.68310546875, "completions/mean_terminated_length": 766.8154296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9258963294443556, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13550302359859, "kl": 0.030517578125, "learning_rate": 1.1508169045143376e-07, "loss": 0.0775, "num_tokens": 2363567431.0, "reward": 2.4207589626312256, "reward_std": 0.4839977025985718, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1573437601327896, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1011.247802734375, "completions/mean_terminated_length": 746.9776000976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9261094241116616, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11051203102371328, "kl": 0.0272216796875, "learning_rate": 1.1499586881882404e-07, "loss": 0.0481, "num_tokens": 2364096358.0, "reward": 2.4285714626312256, "reward_std": 0.42416635155677795, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14480386674404144, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1011.24560546875, "completions/mean_terminated_length": 768.4793701171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9263225187789675, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12206389706978321, "kl": 0.027496337890625, "learning_rate": 1.1491028792770419e-07, "loss": 0.0301, "num_tokens": 2364617636.0, "reward": 2.411830425262451, "reward_std": 0.4586961269378662, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13351379334926605, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1064.34375, "completions/mean_terminated_length": 860.1886596679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9265356134462736, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12648405440577978, "kl": 0.025299072265625, "learning_rate": 1.1482494782543672e-07, "loss": 0.0952, "num_tokens": 2365165374.0, "reward": 2.404017925262451, "reward_std": 0.3988664746284485, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10557335615158081, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 893.9710083007812, "completions/mean_terminated_length": 755.4874877929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9267487081135795, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13711745422304336, "kl": 0.030181884765625, "learning_rate": 1.147398485592508e-07, "loss": 0.1287, "num_tokens": 2365627249.0, "reward": 2.5691964626312256, "reward_std": 0.42172396183013916, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.47764313220977783, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11286582797765732, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1079.2076416015625, "completions/mean_terminated_length": 814.9915161132812, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9269618027808854, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14518643508514753, "kl": 0.0277099609375, "learning_rate": 1.1465499017624243e-07, "loss": 0.1067, "num_tokens": 2366181118.0, "reward": 2.3286831378936768, "reward_std": 0.4309108257293701, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.13648569583892822, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1105.8348388671875, "completions/mean_terminated_length": 878.7755737304688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9271748974481914, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12323064942671946, "kl": 0.024017333984375, "learning_rate": 1.1457037272337414e-07, "loss": 0.1172, "num_tokens": 2366750724.0, "reward": 2.375, "reward_std": 0.523169755935669, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17942628264427185, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 890.6473388671875, "completions/mean_terminated_length": 751.7649536132812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9273879921154973, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.136656650325123, "kl": 0.0301513671875, "learning_rate": 1.1448599624747523e-07, "loss": 0.0549, "num_tokens": 2367215318.0, "reward": 2.541294813156128, "reward_std": 0.37390655279159546, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12805373966693878, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1010.7076416015625, "completions/mean_terminated_length": 788.6314697265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9276010867828033, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12430968068296423, "kl": 0.028839111328125, "learning_rate": 1.144018607952417e-07, "loss": 0.0639, "num_tokens": 2367732739.0, "reward": 2.41796875, "reward_std": 0.4178357720375061, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1199323832988739, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1202.21875, "completions/mean_terminated_length": 906.7047729492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9278141814501092, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11453165564394323, "kl": 0.02557373046875, "learning_rate": 1.143179664132359e-07, "loss": 0.0571, "num_tokens": 2368344981.0, "reward": 2.3777902126312256, "reward_std": 0.503399133682251, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.15795646607875824, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 879.857177734375, "completions/mean_terminated_length": 726.4646606445312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9280272761174152, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12289841008234728, "kl": 0.02880859375, "learning_rate": 1.142343131478872e-07, "loss": 0.0077, "num_tokens": 2368806389.0, "reward": 2.5496652126312256, "reward_std": 0.3370528519153595, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10763297230005264, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1047.247802734375, "completions/mean_terminated_length": 852.4346313476562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9282403707847211, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12407789343582892, "kl": 0.025054931640625, "learning_rate": 1.141509010454911e-07, "loss": 0.0489, "num_tokens": 2369347636.0, "reward": 2.435267925262451, "reward_std": 0.44633668661117554, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.13465432822704315, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 903.6741333007812, "completions/mean_terminated_length": 763.142822265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9284534654520271, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1285686642720892, "kl": 0.029205322265625, "learning_rate": 1.1406773015220993e-07, "loss": 0.0475, "num_tokens": 2369823218.0, "reward": 2.53515625, "reward_std": 0.47579964995384216, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13927440345287323, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 775.872802734375, "completions/mean_terminated_length": 637.32421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.928666560119333, "frac_reward_zero_std": 0.3214285969734192, "grad_norm": 0.1306281199466388, "kl": 0.03546142578125, "learning_rate": 1.1398480051407226e-07, "loss": 0.1102, "num_tokens": 2370232505.0, "reward": 2.640625, "reward_std": 0.30411162972450256, "rewards/accuracy_reward/mean": 0.6941964030265808, "rewards/accuracy_reward/std": 0.4612620174884796, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08709356933832169, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 942.3035888671875, "completions/mean_terminated_length": 777.86669921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9288796547866389, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14897681364637724, "kl": 0.031158447265625, "learning_rate": 1.1390211217697346e-07, "loss": 0.048, "num_tokens": 2370722385.0, "reward": 2.4135046005249023, "reward_std": 0.4109794795513153, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396106719971, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.13851940631866455, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1152.40185546875, "completions/mean_terminated_length": 920.955078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9290927494539449, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11341616821391098, "kl": 0.025787353515625, "learning_rate": 1.138196651866751e-07, "loss": 0.0688, "num_tokens": 2371311893.0, "reward": 2.4229912757873535, "reward_std": 0.4458398222923279, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15822990238666534, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1001.9152221679688, "completions/mean_terminated_length": 827.5677490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9293058441212508, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13453878503123481, "kl": 0.02850341796875, "learning_rate": 1.1373745958880523e-07, "loss": 0.0722, "num_tokens": 2371826959.0, "reward": 2.540736675262451, "reward_std": 0.45150238275527954, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.15279823541641235, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 911.5469360351562, "completions/mean_terminated_length": 729.0077514648438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9295189387885568, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13133984613212665, "kl": 0.027862548828125, "learning_rate": 1.1365549542885853e-07, "loss": 0.0907, "num_tokens": 2372303140.0, "reward": 2.3989956378936768, "reward_std": 0.38063284754753113, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10439462959766388, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1037.212158203125, "completions/mean_terminated_length": 827.4258422851562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9297320334558627, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11123847437447203, "kl": 0.026214599609375, "learning_rate": 1.1357377275219578e-07, "loss": 0.0719, "num_tokens": 2372841523.0, "reward": 2.4146206378936768, "reward_std": 0.3301648199558258, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962118953466415, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 911.9129638671875, "completions/mean_terminated_length": 732.8397827148438, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.9299451281231688, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16768318350005512, "kl": 0.031524658203125, "learning_rate": 1.1349229160404416e-07, "loss": 0.0652, "num_tokens": 2373325260.0, "reward": 2.5245537757873535, "reward_std": 0.46250128746032715, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295229911804, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12782442569732666, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 840.8058471679688, "completions/mean_terminated_length": 661.2743530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9301582227904747, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14941438538884247, "kl": 0.033905029296875, "learning_rate": 1.1341105202949734e-07, "loss": 0.076, "num_tokens": 2373766085.0, "reward": 2.4799108505249023, "reward_std": 0.37186330556869507, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.12569716572761536, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1156.5692138671875, "completions/mean_terminated_length": 913.4517211914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9303713174577806, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11774759243800054, "kl": 0.023162841796875, "learning_rate": 1.1333005407351516e-07, "loss": 0.0731, "num_tokens": 2374355508.0, "reward": 2.314732313156128, "reward_std": 0.42696940898895264, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1373893767595291, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 926.544677734375, "completions/mean_terminated_length": 776.0708618164062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9305844121250866, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1285513172212822, "kl": 0.025482177734375, "learning_rate": 1.1324929778092393e-07, "loss": 0.0782, "num_tokens": 2374833352.0, "reward": 2.4497768878936768, "reward_std": 0.44200292229652405, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13607001304626465, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 999.6808471679688, "completions/mean_terminated_length": 808.8258666992188, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9307975067923925, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10451063452429236, "kl": 0.025665283203125, "learning_rate": 1.1316878319641586e-07, "loss": 0.0607, "num_tokens": 2375353929.0, "reward": 2.513951063156128, "reward_std": 0.3639531433582306, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.08475254476070404, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1015.7410888671875, "completions/mean_terminated_length": 781.0082397460938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9310106014596985, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1305959057171984, "kl": 0.026397705078125, "learning_rate": 1.1308851036454973e-07, "loss": 0.0845, "num_tokens": 2375880357.0, "reward": 2.4229912757873535, "reward_std": 0.3985334038734436, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1406535804271698, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1042.0491943359375, "completions/mean_terminated_length": 764.05126953125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9312236961270044, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 1.2971856541226816, "kl": 0.035675048828125, "learning_rate": 1.1300847932975042e-07, "loss": 0.0341, "num_tokens": 2376419739.0, "reward": 2.2176339626312256, "reward_std": 0.36447110772132874, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.4708675146102905, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12758491933345795, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1001.2433471679688, "completions/mean_terminated_length": 820.3900756835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9314367907943104, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1578556177760365, "kl": 0.027587890625, "learning_rate": 1.1292869013630895e-07, "loss": 0.0934, "num_tokens": 2376944280.0, "reward": 2.3989956378936768, "reward_std": 0.4837868809700012, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1552460640668869, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1080.296875, "completions/mean_terminated_length": 876.2946166992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9316498854616163, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.132885951101939, "kl": 0.026763916015625, "learning_rate": 1.128491428283825e-07, "loss": 0.1011, "num_tokens": 2377492509.0, "reward": 2.4068081378936768, "reward_std": 0.5362050533294678, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.170974463224411, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1153.212158203125, "completions/mean_terminated_length": 892.7694091796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9318629801289223, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11481821301552803, "kl": 0.023223876953125, "learning_rate": 1.1276983744999442e-07, "loss": 0.053, "num_tokens": 2378080108.0, "reward": 2.322544813156128, "reward_std": 0.35287967324256897, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11345604062080383, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 941.6116333007812, "completions/mean_terminated_length": 750.4555053710938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9320760747962282, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12129912300372475, "kl": 0.026214599609375, "learning_rate": 1.1269077404503432e-07, "loss": 0.0768, "num_tokens": 2378579518.0, "reward": 2.4760046005249023, "reward_std": 0.4287765920162201, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.4976775646209717, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11919102817773819, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 986.77685546875, "completions/mean_terminated_length": 752.5558471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9322891694635341, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13361245328437274, "kl": 0.028350830078125, "learning_rate": 1.1261195265725756e-07, "loss": 0.0368, "num_tokens": 2379089434.0, "reward": 2.4575893878936768, "reward_std": 0.36607611179351807, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.10992966592311859, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1007.0469360351562, "completions/mean_terminated_length": 820.7711181640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.9325022641308401, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13008563735391598, "kl": 0.02606201171875, "learning_rate": 1.125333733302857e-07, "loss": 0.0648, "num_tokens": 2379609711.0, "reward": 2.3895089626312256, "reward_std": 0.4988686144351959, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854744791984558, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16375111043453217, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1101.2545166015625, "completions/mean_terminated_length": 836.1657104492188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.932715358798146, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12968861329862844, "kl": 0.025177001953125, "learning_rate": 1.1245503610760662e-07, "loss": 0.1161, "num_tokens": 2380179105.0, "reward": 2.3777902126312256, "reward_std": 0.4191914200782776, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13335825502872467, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 877.2567138671875, "completions/mean_terminated_length": 706.585693359375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9329284534654521, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13647455532216726, "kl": 0.03204345703125, "learning_rate": 1.1237694103257375e-07, "loss": 0.1044, "num_tokens": 2380636100.0, "reward": 2.6450893878936768, "reward_std": 0.4329272508621216, "rewards/accuracy_reward/mean": 0.7410714030265808, "rewards/accuracy_reward/std": 0.43853598833084106, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12891364097595215, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 909.6495971679688, "completions/mean_terminated_length": 698.8438720703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.933141548132758, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.214846058861817, "kl": 0.0323486328125, "learning_rate": 1.1229908814840692e-07, "loss": 0.0847, "num_tokens": 2381114215.0, "reward": 2.3582589626312256, "reward_std": 0.43972936272621155, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9542410969734192, "rewards/tag_count_reward/std": 0.16929872334003448, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 929.1406860351562, "completions/mean_terminated_length": 714.8909301757812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.933354642800064, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14589530696516964, "kl": 0.029815673828125, "learning_rate": 1.1222147749819157e-07, "loss": 0.0477, "num_tokens": 2381605382.0, "reward": 2.411830425262451, "reward_std": 0.4584210216999054, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1314026117324829, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 962.5357666015625, "completions/mean_terminated_length": 754.6808471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9335677374673699, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13908846066914746, "kl": 0.028228759765625, "learning_rate": 1.1214410912487935e-07, "loss": 0.1204, "num_tokens": 2382109718.0, "reward": 2.4503350257873535, "reward_std": 0.4439104199409485, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1427011638879776, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 934.3527221679688, "completions/mean_terminated_length": 752.1194458007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9337808321346758, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12124389500192245, "kl": 0.027587890625, "learning_rate": 1.1206698307128779e-07, "loss": 0.0604, "num_tokens": 2382595908.0, "reward": 2.453125, "reward_std": 0.33934077620506287, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08547309041023254, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 893.07373046875, "completions/mean_terminated_length": 711.031005859375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9339939268019818, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11985472897366503, "kl": 0.02740478515625, "learning_rate": 1.119900993801001e-07, "loss": -0.0122, "num_tokens": 2383065029.0, "reward": 2.5541296005249023, "reward_std": 0.287217915058136, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1616371124982834, "rewards/tag_count_reward/mean": 0.9871651530265808, "rewards/tag_count_reward/std": 0.08991295844316483, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 979.8438110351562, "completions/mean_terminated_length": 775.3031616210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9342070214692877, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12333641176694152, "kl": 0.03045654296875, "learning_rate": 1.1191345809386565e-07, "loss": 0.0322, "num_tokens": 2383572351.0, "reward": 2.3989956378936768, "reward_std": 0.35006973147392273, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.09933306276798248, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 846.0469360351562, "completions/mean_terminated_length": 731.4352416992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9344201161365937, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1414336794581791, "kl": 0.03057861328125, "learning_rate": 1.1183705925499947e-07, "loss": 0.0607, "num_tokens": 2384014212.0, "reward": 2.584263563156128, "reward_std": 0.35953983664512634, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1208655834197998, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 926.7991333007812, "completions/mean_terminated_length": 736.5169677734375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9346332108038996, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12186933627002564, "kl": 0.029083251953125, "learning_rate": 1.1176090290578244e-07, "loss": 0.0515, "num_tokens": 2384501162.0, "reward": 2.564174175262451, "reward_std": 0.39993947744369507, "rewards/accuracy_reward/mean": 0.6495535969734192, "rewards/accuracy_reward/std": 0.4776431620121002, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12015076726675034, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1020.2344360351562, "completions/mean_terminated_length": 789.9699096679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9348463054712056, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11369895684751384, "kl": 0.028076171875, "learning_rate": 1.1168498908836136e-07, "loss": 0.0419, "num_tokens": 2385024467.0, "reward": 2.458705425262451, "reward_std": 0.3179273009300232, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.09750170260667801, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1100.2567138671875, "completions/mean_terminated_length": 845.1983032226562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9350594001385115, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1360018368592396, "kl": 0.0242919921875, "learning_rate": 1.1160931784474858e-07, "loss": 0.1064, "num_tokens": 2385588278.0, "reward": 2.27734375, "reward_std": 0.4707585871219635, "rewards/accuracy_reward/mean": 0.41203704476356506, "rewards/accuracy_reward/std": 0.4927724003791809, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16933098435401917, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 879.55810546875, "completions/mean_terminated_length": 698.87109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9352724948058175, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.13263314476248234, "kl": 0.034912109375, "learning_rate": 1.1153388921682253e-07, "loss": 0.0084, "num_tokens": 2386050160.0, "reward": 2.4112725257873535, "reward_std": 0.3310636878013611, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10811903327703476, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 876.7031860351562, "completions/mean_terminated_length": 729.5552978515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9354855894731234, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13724809157990736, "kl": 0.029754638671875, "learning_rate": 1.1145870324632704e-07, "loss": 0.0237, "num_tokens": 2386510299.0, "reward": 2.4174108505249023, "reward_std": 0.2807876467704773, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9665178656578064, "rewards/format_reward/std": 0.1800929754972458, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08547309041023254, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1043.0179443359375, "completions/mean_terminated_length": 824.5435180664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9356986841404293, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13280877350636944, "kl": 0.0255126953125, "learning_rate": 1.1138375997487187e-07, "loss": 0.0904, "num_tokens": 2387046835.0, "reward": 2.4017858505249023, "reward_std": 0.4428028166294098, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09910815209150314, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1039.74560546875, "completions/mean_terminated_length": 830.4851684570312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9359117788077354, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12571804545203322, "kl": 0.025238037109375, "learning_rate": 1.1130905944393228e-07, "loss": 0.0243, "num_tokens": 2387585841.0, "reward": 2.3465402126312256, "reward_std": 0.3728891909122467, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12035840004682541, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 908.107177734375, "completions/mean_terminated_length": 768.1203002929688, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9361248734750413, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13581340330210478, "kl": 0.0308837890625, "learning_rate": 1.112346016948494e-07, "loss": 0.1104, "num_tokens": 2388052401.0, "reward": 2.5089287757873535, "reward_std": 0.4902397692203522, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48466411232948303, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14140157401561737, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1066.4285888671875, "completions/mean_terminated_length": 819.664794921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9363379681423473, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.9904674160188459, "kl": 0.028839111328125, "learning_rate": 1.1116038676882983e-07, "loss": 0.0221, "num_tokens": 2388606625.0, "reward": 2.4246652126312256, "reward_std": 0.4120630621910095, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1131737232208252, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 852.1920166015625, "completions/mean_terminated_length": 731.729736328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9365510628096532, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1392593356652518, "kl": 0.031494140625, "learning_rate": 1.1108641470694582e-07, "loss": 0.0252, "num_tokens": 2389053943.0, "reward": 2.474888563156128, "reward_std": 0.369652658700943, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10304656624794006, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 966.310302734375, "completions/mean_terminated_length": 805.443603515625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9367641574769592, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1716921415544946, "kl": 0.031768798828125, "learning_rate": 1.110126855501354e-07, "loss": 0.0666, "num_tokens": 2389556482.0, "reward": 2.3013393878936768, "reward_std": 0.4197487235069275, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13888955116271973, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1038.071533203125, "completions/mean_terminated_length": 841.4719848632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9369772521442651, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11352532306758972, "kl": 0.025604248046875, "learning_rate": 1.1093919933920187e-07, "loss": 0.0462, "num_tokens": 2390091314.0, "reward": 2.377232313156128, "reward_std": 0.4040103852748871, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.10383255779743195, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1136.3348388671875, "completions/mean_terminated_length": 881.0685424804688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9371903468115711, "frac_reward_zero_std": 0.0, "grad_norm": 0.1310437393972883, "kl": 0.026153564453125, "learning_rate": 1.1086595611481425e-07, "loss": 0.0964, "num_tokens": 2390676120.0, "reward": 2.3643975257873535, "reward_std": 0.5020871758460999, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15332838892936707, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 883.1339721679688, "completions/mean_terminated_length": 685.4412841796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.937403441478877, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14230780157464434, "kl": 0.0322265625, "learning_rate": 1.1079295591750711e-07, "loss": 0.0671, "num_tokens": 2391136580.0, "reward": 2.506138563156128, "reward_std": 0.40291139483451843, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10073082149028778, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 891.2366333007812, "completions/mean_terminated_length": 701.9480590820312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9376165361461829, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1253766819664411, "kl": 0.0325927734375, "learning_rate": 1.1072019878768041e-07, "loss": 0.0551, "num_tokens": 2391606270.0, "reward": 2.59765625, "reward_std": 0.36383917927742004, "rewards/accuracy_reward/mean": 0.6830357313156128, "rewards/accuracy_reward/std": 0.4658135175704956, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13669590651988983, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1183.1741943359375, "completions/mean_terminated_length": 841.0155639648438, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9378296308134889, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12394164739988968, "kl": 0.022552490234375, "learning_rate": 1.1064768476559973e-07, "loss": 0.0876, "num_tokens": 2392204556.0, "reward": 2.3236608505249023, "reward_std": 0.4416102170944214, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13686132431030273, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1059.7388916015625, "completions/mean_terminated_length": 851.4027099609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9380427254807948, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13051586307945492, "kl": 0.02874755859375, "learning_rate": 1.105754138913959e-07, "loss": 0.0677, "num_tokens": 2392745399.0, "reward": 2.498326063156128, "reward_std": 0.43339961767196655, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1399807333946228, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 991.0491333007812, "completions/mean_terminated_length": 785.2960205078125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9382558201481008, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12506374457362188, "kl": 0.02874755859375, "learning_rate": 1.1050338620506544e-07, "loss": 0.0658, "num_tokens": 2393257549.0, "reward": 2.5424108505249023, "reward_std": 0.4021592438220978, "rewards/accuracy_reward/mean": 0.6450892686843872, "rewards/accuracy_reward/std": 0.4790211617946625, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.14942027628421783, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 822.1094360351562, "completions/mean_terminated_length": 718.2203979492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9384689148154067, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14623618122121235, "kl": 0.03240966796875, "learning_rate": 1.104316017464701e-07, "loss": 0.0674, "num_tokens": 2393697838.0, "reward": 2.646205425262451, "reward_std": 0.3961551785469055, "rewards/accuracy_reward/mean": 0.7410714030265808, "rewards/accuracy_reward/std": 0.43853598833084106, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12473505735397339, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 940.85498046875, "completions/mean_terminated_length": 746.1600952148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9386820094827127, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13314206068965498, "kl": 0.028472900390625, "learning_rate": 1.1036006055533711e-07, "loss": 0.0973, "num_tokens": 2394181149.0, "reward": 2.39453125, "reward_std": 0.4445429742336273, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1372518539428711, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 911.8281860351562, "completions/mean_terminated_length": 679.7069702148438, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9388951041500186, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15228734164723715, "kl": 0.027801513671875, "learning_rate": 1.1028876267125905e-07, "loss": 0.1019, "num_tokens": 2394657616.0, "reward": 2.396763563156128, "reward_std": 0.3983593285083771, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.1276179403066635, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 862.0670166015625, "completions/mean_terminated_length": 706.33837890625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9391081988173245, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12942239639744188, "kl": 0.029937744140625, "learning_rate": 1.1021770813369378e-07, "loss": 0.0699, "num_tokens": 2395121918.0, "reward": 2.513951063156128, "reward_std": 0.3371692895889282, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1240563914179802, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 973.6272583007812, "completions/mean_terminated_length": 743.6124877929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9393212934846306, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12884700790264783, "kl": 0.026580810546875, "learning_rate": 1.1014689698196463e-07, "loss": 0.085, "num_tokens": 2395635239.0, "reward": 2.4620537757873535, "reward_std": 0.3684464395046234, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967316269874573, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 907.0692138671875, "completions/mean_terminated_length": 716.9140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9395343881519365, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1451302704816034, "kl": 0.02947998046875, "learning_rate": 1.1007632925526012e-07, "loss": 0.1251, "num_tokens": 2396108678.0, "reward": 2.6082589626312256, "reward_std": 0.45828431844711304, "rewards/accuracy_reward/mean": 0.7075892686843872, "rewards/accuracy_reward/std": 0.4553784728050232, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12758491933345795, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 938.6004638671875, "completions/mean_terminated_length": 760.40673828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9397474828192425, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13217111284181304, "kl": 0.0284423828125, "learning_rate": 1.1000600499263406e-07, "loss": 0.0485, "num_tokens": 2396593875.0, "reward": 2.517857313156128, "reward_std": 0.42520764470100403, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10013572871685028, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1068.1629638671875, "completions/mean_terminated_length": 821.8351440429688, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.9399605774865484, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14822661303540743, "kl": 0.027587890625, "learning_rate": 1.0993592423300561e-07, "loss": 0.0781, "num_tokens": 2397139148.0, "reward": 2.3978796005249023, "reward_std": 0.43711966276168823, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.14151519536972046, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1006.8504638671875, "completions/mean_terminated_length": 807.4813842773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9401736721538544, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1354165066113595, "kl": 0.026153564453125, "learning_rate": 1.0986608701515907e-07, "loss": 0.1085, "num_tokens": 2397652617.0, "reward": 2.4838171005249023, "reward_std": 0.4179857671260834, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.1442066878080368, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 961.60498046875, "completions/mean_terminated_length": 767.1973876953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9403867668211603, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12511375976341238, "kl": 0.029571533203125, "learning_rate": 1.0979649337774394e-07, "loss": 0.0594, "num_tokens": 2398149816.0, "reward": 2.4732143878936768, "reward_std": 0.4018925428390503, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.11992326378822327, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 987.8281860351562, "completions/mean_terminated_length": 739.5785522460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9405998614884663, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13422537765455228, "kl": 0.027008056640625, "learning_rate": 1.0972714335927498e-07, "loss": 0.1033, "num_tokens": 2398659723.0, "reward": 2.3738839626312256, "reward_std": 0.3860425055027008, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.1421368569135666, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 950.71435546875, "completions/mean_terminated_length": 764.4908447265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9408129561557722, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1313605382830849, "kl": 0.02862548828125, "learning_rate": 1.0965803699813223e-07, "loss": 0.103, "num_tokens": 2399148235.0, "reward": 2.521763563156128, "reward_std": 0.43817147612571716, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12086557596921921, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 909.919677734375, "completions/mean_terminated_length": 782.8386840820312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9410260508230781, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13797339735679653, "kl": 0.029388427734375, "learning_rate": 1.0958917433256066e-07, "loss": 0.0568, "num_tokens": 2399627815.0, "reward": 2.5390625, "reward_std": 0.42338475584983826, "rewards/accuracy_reward/mean": 0.6316964030265808, "rewards/accuracy_reward/std": 0.4828835427761078, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13455694913864136, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1031.4107666015625, "completions/mean_terminated_length": 839.95751953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9412391454903841, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13067279555446829, "kl": 0.02642822265625, "learning_rate": 1.0952055540067057e-07, "loss": 0.1179, "num_tokens": 2400162511.0, "reward": 2.4190850257873535, "reward_std": 0.3631475269794464, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.1057254895567894, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 930.4219360351562, "completions/mean_terminated_length": 733.8923950195312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.94145224015769, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1276741451119799, "kl": 0.028289794921875, "learning_rate": 1.0945218024043721e-07, "loss": 0.0126, "num_tokens": 2400645708.0, "reward": 2.4916296005249023, "reward_std": 0.4471625089645386, "rewards/accuracy_reward/mean": 0.5870535969734192, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.12517838180065155, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 874.1094360351562, "completions/mean_terminated_length": 696.0642700195312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.941665334824996, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13604941594691422, "kl": 0.030029296875, "learning_rate": 1.0938404888970096e-07, "loss": 0.0312, "num_tokens": 2401105997.0, "reward": 2.4754464626312256, "reward_std": 0.40643665194511414, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12337145954370499, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1001.7344360351562, "completions/mean_terminated_length": 811.2533569335938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9418784294923019, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12908360608501396, "kl": 0.02691650390625, "learning_rate": 1.0931616138616751e-07, "loss": 0.0902, "num_tokens": 2401625094.0, "reward": 2.3895089626312256, "reward_std": 0.4120877683162689, "rewards/accuracy_reward/mean": 0.5069444179534912, "rewards/accuracy_reward/std": 0.5005314350128174, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.14601868391036987, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1046.0625, "completions/mean_terminated_length": 828.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.942091524159608, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14533836784691892, "kl": 0.02716064453125, "learning_rate": 1.0924851776740713e-07, "loss": 0.138, "num_tokens": 2402162786.0, "reward": 2.435826063156128, "reward_std": 0.5159264802932739, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.16543777287006378, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 939.8035888671875, "completions/mean_terminated_length": 713.3978271484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9423046188269139, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12571940291886324, "kl": 0.028778076171875, "learning_rate": 1.0918111807085558e-07, "loss": 0.0599, "num_tokens": 2402653130.0, "reward": 2.4877233505249023, "reward_std": 0.3626733124256134, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.12103716284036636, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 961.7701416015625, "completions/mean_terminated_length": 793.79638671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9425177134942198, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13940454937265723, "kl": 0.029876708984375, "learning_rate": 1.0911396233381338e-07, "loss": 0.0804, "num_tokens": 2403151843.0, "reward": 2.5005581378936768, "reward_std": 0.4199605882167816, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4889315068721771, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1454136222600937, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 850.8058471679688, "completions/mean_terminated_length": 726.9581298828125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9427308081615258, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1336195718897701, "kl": 0.03363037109375, "learning_rate": 1.0904705059344605e-07, "loss": 0.042, "num_tokens": 2403601740.0, "reward": 2.6138393878936768, "reward_std": 0.32564234733581543, "rewards/accuracy_reward/mean": 0.6852678656578064, "rewards/accuracy_reward/std": 0.4649282693862915, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9910714030265808, "rewards/tag_count_reward/std": 0.05962758511304855, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 893.7188110351562, "completions/mean_terminated_length": 732.1781005859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9429439028288317, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14153226418602613, "kl": 0.031494140625, "learning_rate": 1.0898038288678416e-07, "loss": 0.052, "num_tokens": 2404065374.0, "reward": 2.4933037757873535, "reward_std": 0.4588598310947418, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1262323260307312, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 940.013427734375, "completions/mean_terminated_length": 745.1705932617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9431569974961377, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1173043467412791, "kl": 0.026397705078125, "learning_rate": 1.0891395925072313e-07, "loss": 0.0666, "num_tokens": 2404554628.0, "reward": 2.5396206378936768, "reward_std": 0.36521852016448975, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.083281509578228, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 836.5960083007812, "completions/mean_terminated_length": 704.660888671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9433700921634436, "frac_reward_zero_std": 0.25, "grad_norm": 0.14746680020960345, "kl": 0.03509521484375, "learning_rate": 1.0884777972202346e-07, "loss": 0.0899, "num_tokens": 2404992703.0, "reward": 2.482142925262451, "reward_std": 0.3794863820075989, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14426834881305695, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1055.919677734375, "completions/mean_terminated_length": 813.4111328125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9435831868307496, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12311732086695507, "kl": 0.02490234375, "learning_rate": 1.0878184433731039e-07, "loss": 0.0453, "num_tokens": 2405543867.0, "reward": 2.4760046005249023, "reward_std": 0.4557977616786957, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12035840004682541, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1018.669677734375, "completions/mean_terminated_length": 811.6997680664062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9437962814980555, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12239655321029592, "kl": 0.026123046875, "learning_rate": 1.0871615313307402e-07, "loss": 0.0743, "num_tokens": 2406068583.0, "reward": 2.5122768878936768, "reward_std": 0.412568598985672, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11055814474821091, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 969.0491333007812, "completions/mean_terminated_length": 798.98193359375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9440093761653615, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12931178408718322, "kl": 0.029632568359375, "learning_rate": 1.0865070614566955e-07, "loss": 0.034, "num_tokens": 2406567677.0, "reward": 2.486049175262451, "reward_std": 0.4383724629878998, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12715734541416168, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1059.5357666015625, "completions/mean_terminated_length": 870.2553100585938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9442224708326674, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11520428219047466, "kl": 0.026580810546875, "learning_rate": 1.0858550341131687e-07, "loss": 0.0438, "num_tokens": 2407108749.0, "reward": 2.467076063156128, "reward_std": 0.4512147903442383, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12338031828403473, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1065.5045166015625, "completions/mean_terminated_length": 790.4057006835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9444355654999733, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.13900965080550057, "kl": 0.029632568359375, "learning_rate": 1.085205449661006e-07, "loss": 0.0441, "num_tokens": 2407658623.0, "reward": 2.415736675262451, "reward_std": 0.3632409870624542, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.126290425658226, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 906.716552734375, "completions/mean_terminated_length": 740.3401489257812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9446486601672793, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1360016041036316, "kl": 0.029266357421875, "learning_rate": 1.0845583084597026e-07, "loss": 0.0969, "num_tokens": 2408131488.0, "reward": 2.446986675262451, "reward_std": 0.37030893564224243, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9871651530265808, "rewards/tag_count_reward/std": 0.08674707263708115, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 992.1920166015625, "completions/mean_terminated_length": 809.77490234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9448617548345852, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1392144043731186, "kl": 0.027862548828125, "learning_rate": 1.0839136108674032e-07, "loss": 0.0802, "num_tokens": 2408642134.0, "reward": 2.4056921005249023, "reward_std": 0.4773463010787964, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.17171035706996918, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 947.4866333007812, "completions/mean_terminated_length": 733.2532958984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9450748495018912, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14749550931281832, "kl": 0.032012939453125, "learning_rate": 1.0832713572408976e-07, "loss": 0.0854, "num_tokens": 2409136000.0, "reward": 2.540736675262451, "reward_std": 0.37744495272636414, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.10893574357032776, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 985.9420166015625, "completions/mean_terminated_length": 762.0486450195312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9452879441691971, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12653013762864193, "kl": 0.027740478515625, "learning_rate": 1.0826315479356235e-07, "loss": 0.0493, "num_tokens": 2409645814.0, "reward": 2.4771206378936768, "reward_std": 0.42411619424819946, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12540756165981293, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1097.0804443359375, "completions/mean_terminated_length": 834.2905883789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9455010388365032, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1269239399175205, "kl": 0.025299072265625, "learning_rate": 1.081994183305667e-07, "loss": 0.0905, "num_tokens": 2410203690.0, "reward": 2.3911831378936768, "reward_std": 0.5035035014152527, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.16515076160430908, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1023.79248046875, "completions/mean_terminated_length": 780.472412109375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9457141335038091, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1223253508124097, "kl": 0.02630615234375, "learning_rate": 1.08135926370376e-07, "loss": 0.0645, "num_tokens": 2410733741.0, "reward": 2.4246652126312256, "reward_std": 0.3663933575153351, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 982.0647583007812, "completions/mean_terminated_length": 771.1577758789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.945927228171115, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12817284210064, "kl": 0.02850341796875, "learning_rate": 1.0807267894812834e-07, "loss": 0.1035, "num_tokens": 2411237898.0, "reward": 2.5, "reward_std": 0.4459177851676941, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.1262323409318924, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 897.3527221679688, "completions/mean_terminated_length": 712.5336303710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.946140322838421, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13932298455989758, "kl": 0.029632568359375, "learning_rate": 1.080096760988262e-07, "loss": 0.0462, "num_tokens": 2411715304.0, "reward": 2.4291296005249023, "reward_std": 0.40668851137161255, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.135438933968544, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1097.1429443359375, "completions/mean_terminated_length": 858.1005249023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9463534175057269, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13866983919380696, "kl": 0.02532958984375, "learning_rate": 1.0794691785733684e-07, "loss": 0.0617, "num_tokens": 2412275272.0, "reward": 2.34765625, "reward_std": 0.5042898058891296, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1645980179309845, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1002.7813110351562, "completions/mean_terminated_length": 740.0167236328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9465665121730329, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1415494046134373, "kl": 0.027496337890625, "learning_rate": 1.0788440425839215e-07, "loss": 0.0605, "num_tokens": 2412789926.0, "reward": 2.407924175262451, "reward_std": 0.46174025535583496, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.12665565311908722, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1046.875, "completions/mean_terminated_length": 784.6083984375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9467796068403388, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12110601505399457, "kl": 0.025726318359375, "learning_rate": 1.0782213533658867e-07, "loss": 0.0699, "num_tokens": 2413324542.0, "reward": 2.2371652126312256, "reward_std": 0.4158909022808075, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15604011714458466, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 951.79248046875, "completions/mean_terminated_length": 759.02099609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9469927015076448, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11757115916704247, "kl": 0.027740478515625, "learning_rate": 1.0776011112638747e-07, "loss": 0.0267, "num_tokens": 2413814769.0, "reward": 2.494419813156128, "reward_std": 0.4215051829814911, "rewards/accuracy_reward/mean": 0.6157407164573669, "rewards/accuracy_reward/std": 0.48698362708091736, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399910926818848, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1048.1585693359375, "completions/mean_terminated_length": 803.7528076171875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9472057961749507, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12401392856837996, "kl": 0.027130126953125, "learning_rate": 1.0769833166211414e-07, "loss": 0.0911, "num_tokens": 2414349592.0, "reward": 2.4838171005249023, "reward_std": 0.46600082516670227, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489606618881226, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16142748296260834, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 932.9241333007812, "completions/mean_terminated_length": 767.0923461914062, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.9474188908422567, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12514265046791356, "kl": 0.030120849609375, "learning_rate": 1.0763679697795899e-07, "loss": 0.0829, "num_tokens": 2414832646.0, "reward": 2.5228796005249023, "reward_std": 0.41759487986564636, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1463720053434372, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1076.19873046875, "completions/mean_terminated_length": 880.7962646484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9476319855095626, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1266694755001441, "kl": 0.025390625, "learning_rate": 1.0757550710797668e-07, "loss": 0.0861, "num_tokens": 2415388879.0, "reward": 2.4654018878936768, "reward_std": 0.44824090600013733, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.13444557785987854, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 961.404052734375, "completions/mean_terminated_length": 780.3046875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9478450801768685, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1337750738774125, "kl": 0.030426025390625, "learning_rate": 1.0751446208608642e-07, "loss": 0.0762, "num_tokens": 2415886628.0, "reward": 2.4854912757873535, "reward_std": 0.450486421585083, "rewards/accuracy_reward/mean": 0.6183035969734192, "rewards/accuracy_reward/std": 0.4863457977771759, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15645259618759155, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 963.3638916015625, "completions/mean_terminated_length": 785.8779296875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9480581748441745, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12293067983244339, "kl": 0.027679443359375, "learning_rate": 1.0745366194607203e-07, "loss": 0.0465, "num_tokens": 2416383463.0, "reward": 2.493861675262451, "reward_std": 0.4035743176937103, "rewards/accuracy_reward/mean": 0.5647321343421936, "rewards/accuracy_reward/std": 0.49634629487991333, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12892211973667145, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 896.24560546875, "completions/mean_terminated_length": 777.0985107421875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9482712695114804, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13178409186063167, "kl": 0.030029296875, "learning_rate": 1.0739310672158174e-07, "loss": 0.0473, "num_tokens": 2416858341.0, "reward": 2.5234375, "reward_std": 0.4131523370742798, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.107582226395607, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 904.5156860351562, "completions/mean_terminated_length": 731.082275390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9484843641787865, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12929407369907378, "kl": 0.0291748046875, "learning_rate": 1.0733279644612822e-07, "loss": 0.0614, "num_tokens": 2417332060.0, "reward": 2.4659600257873535, "reward_std": 0.3387841284275055, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.11734377592802048, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 986.6473388671875, "completions/mean_terminated_length": 734.5028076171875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9486974588460924, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1429625872946558, "kl": 0.027313232421875, "learning_rate": 1.072727311530886e-07, "loss": 0.1122, "num_tokens": 2417850686.0, "reward": 2.36328125, "reward_std": 0.46211400628089905, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12493880838155746, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1048.9554443359375, "completions/mean_terminated_length": 811.61328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9489105535133984, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11739460857062924, "kl": 0.027130126953125, "learning_rate": 1.0721291087570435e-07, "loss": 0.0429, "num_tokens": 2418388474.0, "reward": 2.3839287757873535, "reward_std": 0.3981561064720154, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14187753200531006, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1015.544677734375, "completions/mean_terminated_length": 787.6730346679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9491236481807043, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12153895420205191, "kl": 0.02630615234375, "learning_rate": 1.0715333564708152e-07, "loss": 0.0686, "num_tokens": 2418912398.0, "reward": 2.4380581378936768, "reward_std": 0.3482321500778198, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1046096533536911, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1036.857177734375, "completions/mean_terminated_length": 840.0213012695312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9493367428480102, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11971476566126323, "kl": 0.028228759765625, "learning_rate": 1.0709400550019032e-07, "loss": 0.0821, "num_tokens": 2419444958.0, "reward": 2.4620537757873535, "reward_std": 0.41845688223838806, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15274205803871155, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 913.216552734375, "completions/mean_terminated_length": 717.1544799804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9495498375153162, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.139177705039715, "kl": 0.029571533203125, "learning_rate": 1.0703492046786555e-07, "loss": 0.1246, "num_tokens": 2419915903.0, "reward": 2.5033483505249023, "reward_std": 0.3002256155014038, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13399912416934967, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 948.2567138671875, "completions/mean_terminated_length": 727.1287231445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9497629321826221, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1556627199468159, "kl": 0.029296875, "learning_rate": 1.0697608058280621e-07, "loss": 0.1149, "num_tokens": 2420415298.0, "reward": 2.4458706378936768, "reward_std": 0.4268389940261841, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14706972241401672, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 951.2076416015625, "completions/mean_terminated_length": 768.4088745117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9499760268499281, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12180670371396651, "kl": 0.03057861328125, "learning_rate": 1.0691748587757567e-07, "loss": 0.0728, "num_tokens": 2420913279.0, "reward": 2.4402902126312256, "reward_std": 0.3962782025337219, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11638234555721283, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 954.1964721679688, "completions/mean_terminated_length": 775.2103881835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.950189121517234, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1503474504749525, "kl": 0.028472900390625, "learning_rate": 1.0685913638460168e-07, "loss": 0.0674, "num_tokens": 2421417159.0, "reward": 2.4151787757873535, "reward_std": 0.44619113206863403, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15324795246124268, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1013.57373046875, "completions/mean_terminated_length": 802.2392578125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.95040221618454, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12767281938620414, "kl": 0.027099609375, "learning_rate": 1.0680103213617606e-07, "loss": 0.0864, "num_tokens": 2421941384.0, "reward": 2.435267925262451, "reward_std": 0.4095615744590759, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11036036163568497, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1029.5670166015625, "completions/mean_terminated_length": 801.3934326171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9506153108518459, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12178717088203075, "kl": 0.024810791015625, "learning_rate": 1.0674317316445523e-07, "loss": 0.0858, "num_tokens": 2422471910.0, "reward": 2.5323662757873535, "reward_std": 0.43594661355018616, "rewards/accuracy_reward/mean": 0.6361607313156128, "rewards/accuracy_reward/std": 0.4816409945487976, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.13763901591300964, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 907.7500610351562, "completions/mean_terminated_length": 731.4226684570312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9508284055191519, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1232258081316246, "kl": 0.029327392578125, "learning_rate": 1.0668555950145965e-07, "loss": 0.0908, "num_tokens": 2422941942.0, "reward": 2.4972100257873535, "reward_std": 0.43454083800315857, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13669590651988983, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 869.1451416015625, "completions/mean_terminated_length": 743.9827270507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9510415001864578, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14109911443051323, "kl": 0.02984619140625, "learning_rate": 1.0662819117907403e-07, "loss": 0.0824, "num_tokens": 2423403671.0, "reward": 2.650669813156128, "reward_std": 0.39399418234825134, "rewards/accuracy_reward/mean": 0.7388392686843872, "rewards/accuracy_reward/std": 0.43975841999053955, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1292569637298584, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1094.2835693359375, "completions/mean_terminated_length": 820.2269897460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9512545948537637, "frac_reward_zero_std": 0.0, "grad_norm": 0.18726304293999432, "kl": 0.029937744140625, "learning_rate": 1.0657106822904735e-07, "loss": 0.0525, "num_tokens": 2423970918.0, "reward": 2.2845983505249023, "reward_std": 0.5178691744804382, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.494759202003479, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9564732313156128, "rewards/tag_count_reward/std": 0.16317066550254822, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1005.6406860351562, "completions/mean_terminated_length": 782.4796752929688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9514676895210697, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13345664224154202, "kl": 0.02630615234375, "learning_rate": 1.0651419068299287e-07, "loss": 0.0794, "num_tokens": 2424488405.0, "reward": 2.4654018878936768, "reward_std": 0.4529860019683838, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1569942682981491, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 928.6027221679688, "completions/mean_terminated_length": 707.11767578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9516807841883757, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13083560485816573, "kl": 0.0272216796875, "learning_rate": 1.0645755857238787e-07, "loss": 0.0541, "num_tokens": 2424973811.0, "reward": 2.533482313156128, "reward_std": 0.37343981862068176, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11889871954917908, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 991.0313110351562, "completions/mean_terminated_length": 791.9734497070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9518938788556817, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1230067664681598, "kl": 0.027252197265625, "learning_rate": 1.064011719285739e-07, "loss": 0.0955, "num_tokens": 2425487745.0, "reward": 2.415736675262451, "reward_std": 0.4372251629829407, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16573180258274078, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 914.779052734375, "completions/mean_terminated_length": 729.3428344726562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9521069735229876, "frac_reward_zero_std": 0.25, "grad_norm": 0.11563039121539463, "kl": 0.0294189453125, "learning_rate": 1.0634503078275669e-07, "loss": 0.0567, "num_tokens": 2425970238.0, "reward": 2.5200893878936768, "reward_std": 0.34107157588005066, "rewards/accuracy_reward/mean": 0.6111111044883728, "rewards/accuracy_reward/std": 0.4880632162094116, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.09730303287506104, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1080.65625, "completions/mean_terminated_length": 844.1944580078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9523200681902936, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1144333209386932, "kl": 0.02490234375, "learning_rate": 1.0628913516600608e-07, "loss": 0.0603, "num_tokens": 2426525508.0, "reward": 2.412388563156128, "reward_std": 0.40310806035995483, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.09746808558702469, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 910.8303833007812, "completions/mean_terminated_length": 728.1761474609375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9525331628575995, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1306880959362473, "kl": 0.0303955078125, "learning_rate": 1.0623348510925593e-07, "loss": 0.1337, "num_tokens": 2426995304.0, "reward": 2.5887277126312256, "reward_std": 0.4283883273601532, "rewards/accuracy_reward/mean": 0.6897321343421936, "rewards/accuracy_reward/std": 0.46312037110328674, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13927440345287323, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1007.4688110351562, "completions/mean_terminated_length": 784.69921875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9527462575249055, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15743305872484828, "kl": 0.02874755859375, "learning_rate": 1.0617808064330438e-07, "loss": 0.1309, "num_tokens": 2427527258.0, "reward": 2.271763563156128, "reward_std": 0.4553113281726837, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14125031232833862, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 957.0692138671875, "completions/mean_terminated_length": 765.2257080078125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9529593521922114, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1390581248103235, "kl": 0.0286865234375, "learning_rate": 1.0612292179881346e-07, "loss": 0.1018, "num_tokens": 2428030585.0, "reward": 2.46875, "reward_std": 0.4433434009552002, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.10992966592311859, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 901.83935546875, "completions/mean_terminated_length": 734.7518920898438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9531724468595173, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14149662350534029, "kl": 0.02825927734375, "learning_rate": 1.0606800860630954e-07, "loss": 0.1067, "num_tokens": 2428504129.0, "reward": 2.4308037757873535, "reward_std": 0.42274850606918335, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119430512189865, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 935.5000610351562, "completions/mean_terminated_length": 732.96044921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9533855415268233, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12986227710193882, "kl": 0.02935791015625, "learning_rate": 1.0601334109618267e-07, "loss": 0.1072, "num_tokens": 2428988449.0, "reward": 2.498326063156128, "reward_std": 0.4107213616371155, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15853238105773926, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1015.8817138671875, "completions/mean_terminated_length": 749.1544799804688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9535986361941292, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14775375946306477, "kl": 0.0277099609375, "learning_rate": 1.0595891929868723e-07, "loss": 0.0765, "num_tokens": 2429511724.0, "reward": 2.2818081378936768, "reward_std": 0.47862571477890015, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.15853238105773926, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1117.29248046875, "completions/mean_terminated_length": 863.4630737304688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9538117308614352, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11815939336230749, "kl": 0.023529052734375, "learning_rate": 1.0590474324394155e-07, "loss": 0.0453, "num_tokens": 2430078079.0, "reward": 2.4051339626312256, "reward_std": 0.3865298628807068, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12062390148639679, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1102.2835693359375, "completions/mean_terminated_length": 834.0143432617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9540248255287411, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11567708777955636, "kl": 0.024261474609375, "learning_rate": 1.0585081296192788e-07, "loss": 0.0699, "num_tokens": 2430648110.0, "reward": 2.427455425262451, "reward_std": 0.41465842723846436, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 953.9375610351562, "completions/mean_terminated_length": 716.0978393554688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9542379201960471, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 5.333851376858454, "kl": 0.1162109375, "learning_rate": 1.057971284824925e-07, "loss": 0.0705, "num_tokens": 2431143938.0, "reward": 2.416294813156128, "reward_std": 0.338835209608078, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9363839030265808, "rewards/tag_count_reward/std": 0.2202649563550949, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1085.180908203125, "completions/mean_terminated_length": 812.0601806640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.954451014863353, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12293688416830006, "kl": 0.02545166015625, "learning_rate": 1.0574368983534565e-07, "loss": 0.0469, "num_tokens": 2431702323.0, "reward": 2.2901787757873535, "reward_std": 0.42563074827194214, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13583585619926453, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1118.274658203125, "completions/mean_terminated_length": 854.5415649414062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9546641095306589, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12656611165390563, "kl": 0.025299072265625, "learning_rate": 1.0569049705006161e-07, "loss": 0.1042, "num_tokens": 2432273230.0, "reward": 2.314174175262451, "reward_std": 0.49043822288513184, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14611589908599854, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 962.99560546875, "completions/mean_terminated_length": 768.8368530273438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.954877204197965, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14755371311635543, "kl": 0.0294189453125, "learning_rate": 1.0563755015607845e-07, "loss": 0.0923, "num_tokens": 2432774860.0, "reward": 2.377232313156128, "reward_std": 0.5108281970024109, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14955389499664307, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 911.8326416015625, "completions/mean_terminated_length": 697.859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9550902988652709, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.19361545496792237, "kl": 0.03546142578125, "learning_rate": 1.0558484918269823e-07, "loss": 0.0422, "num_tokens": 2433256801.0, "reward": 2.497767925262451, "reward_std": 0.4049406945705414, "rewards/accuracy_reward/mean": 0.6319444179534912, "rewards/accuracy_reward/std": 0.48283568024635315, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1045.9888916015625, "completions/mean_terminated_length": 814.7554931640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9553033935325769, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12363393297317277, "kl": 0.025543212890625, "learning_rate": 1.0553239415908689e-07, "loss": 0.0794, "num_tokens": 2433795500.0, "reward": 2.3359375, "reward_std": 0.4033602476119995, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1502326875925064, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1055.8616943359375, "completions/mean_terminated_length": 846.7081298828125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9555164881998828, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12489362531603199, "kl": 0.027252197265625, "learning_rate": 1.0548018511427429e-07, "loss": 0.0596, "num_tokens": 2434333662.0, "reward": 2.451451063156128, "reward_std": 0.4044599235057831, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14299827814102173, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1065.9732666015625, "completions/mean_terminated_length": 808.7098388671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9557295828671888, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1150258814940669, "kl": 0.024627685546875, "learning_rate": 1.0542822207715416e-07, "loss": 0.0659, "num_tokens": 2434879458.0, "reward": 2.419642925262451, "reward_std": 0.4273401200771332, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14530304074287415, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 957.2098388671875, "completions/mean_terminated_length": 758.6227416992188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9559426775344947, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13386721870679222, "kl": 0.026763916015625, "learning_rate": 1.0537650507648399e-07, "loss": 0.0814, "num_tokens": 2435374576.0, "reward": 2.459263563156128, "reward_std": 0.404101699590683, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12130890041589737, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 907.3928833007812, "completions/mean_terminated_length": 737.7640991210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9561557722018007, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13229215692504598, "kl": 0.030059814453125, "learning_rate": 1.0532503414088523e-07, "loss": 0.0638, "num_tokens": 2435842704.0, "reward": 2.4246652126312256, "reward_std": 0.3579663038253784, "rewards/accuracy_reward/mean": 0.4977678656578064, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.12151455134153366, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 975.16748046875, "completions/mean_terminated_length": 786.5065307617188, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9563688668691066, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12698549637986675, "kl": 0.028289794921875, "learning_rate": 1.0527380929884322e-07, "loss": 0.0784, "num_tokens": 2436343051.0, "reward": 2.560826063156128, "reward_std": 0.4783167839050293, "rewards/accuracy_reward/mean": 0.6763392686843872, "rewards/accuracy_reward/std": 0.46839532256126404, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.14097605645656586, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 992.4063110351562, "completions/mean_terminated_length": 780.155517578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9565819615364125, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13632784793579203, "kl": 0.02777099609375, "learning_rate": 1.0522283057870675e-07, "loss": 0.0886, "num_tokens": 2436859585.0, "reward": 2.4330358505249023, "reward_std": 0.3583570420742035, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.08746545761823654, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1001.0045166015625, "completions/mean_terminated_length": 823.31591796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9567950562037185, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12344958373449007, "kl": 0.030548095703125, "learning_rate": 1.0517209800868883e-07, "loss": 0.0588, "num_tokens": 2437375603.0, "reward": 2.533482313156128, "reward_std": 0.4051283001899719, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12083587795495987, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 975.4129638671875, "completions/mean_terminated_length": 783.4763793945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9570081508710244, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14000978165755174, "kl": 0.029998779296875, "learning_rate": 1.0512161161686593e-07, "loss": 0.0901, "num_tokens": 2437877644.0, "reward": 2.3989956378936768, "reward_std": 0.4502313733100891, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15647155046463013, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 951.607177734375, "completions/mean_terminated_length": 765.5352783203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.9572212455383304, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14111165238033374, "kl": 0.028350830078125, "learning_rate": 1.0507137143117852e-07, "loss": 0.0757, "num_tokens": 2438369740.0, "reward": 2.493861675262451, "reward_std": 0.438626229763031, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561823636293411, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1108.372802734375, "completions/mean_terminated_length": 831.372802734375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9574343402056363, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1224551444227295, "kl": 0.024139404296875, "learning_rate": 1.0502137747943063e-07, "loss": 0.0831, "num_tokens": 2438944099.0, "reward": 2.3604912757873535, "reward_std": 0.4305449426174164, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1396559476852417, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1040.2388916015625, "completions/mean_terminated_length": 769.0283203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9576474348729423, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.5186555658797078, "kl": 0.049163818359375, "learning_rate": 1.0497162978929006e-07, "loss": 0.1269, "num_tokens": 2439488270.0, "reward": 2.4056921005249023, "reward_std": 0.4129801094532013, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.500314474105835, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14469929039478302, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 992.5692138671875, "completions/mean_terminated_length": 816.6640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9578605295402483, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12198193137182425, "kl": 0.027069091796875, "learning_rate": 1.0492212838828843e-07, "loss": 0.113, "num_tokens": 2439995533.0, "reward": 2.560267925262451, "reward_std": 0.4273020029067993, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11849905550479889, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1023.357177734375, "completions/mean_terminated_length": 817.3297729492188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9580736242075542, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13344019344455174, "kl": 0.029571533203125, "learning_rate": 1.048728733038209e-07, "loss": 0.1146, "num_tokens": 2440522221.0, "reward": 2.4330358505249023, "reward_std": 0.49730271100997925, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.4940521717071533, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9441964030265808, "rewards/tag_count_reward/std": 0.1875898689031601, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 972.7053833007812, "completions/mean_terminated_length": 773.5767211914062, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.9582867188748602, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1315559309453804, "kl": 0.027435302734375, "learning_rate": 1.0482386456314645e-07, "loss": 0.0977, "num_tokens": 2441017673.0, "reward": 2.50390625, "reward_std": 0.4680311679840088, "rewards/accuracy_reward/mean": 0.6383928656578064, "rewards/accuracy_reward/std": 0.4810029864311218, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.29793688654899597, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.14748506247997284, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1062.8170166015625, "completions/mean_terminated_length": 864.723876953125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9584998135421661, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13636840728692867, "kl": 0.027801513671875, "learning_rate": 1.0477510219338761e-07, "loss": 0.0787, "num_tokens": 2441557687.0, "reward": 2.5066964626312256, "reward_std": 0.43101081252098083, "rewards/accuracy_reward/mean": 0.6049107313156128, "rewards/accuracy_reward/std": 0.4894163906574249, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1457662582397461, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1089.060302734375, "completions/mean_terminated_length": 864.51513671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9587129082094721, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12927991916379877, "kl": 0.0260009765625, "learning_rate": 1.0472658622153066e-07, "loss": 0.1102, "num_tokens": 2442123698.0, "reward": 2.321986675262451, "reward_std": 0.4250272512435913, "rewards/accuracy_reward/mean": 0.4352678656578064, "rewards/accuracy_reward/std": 0.4963463246822357, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13180458545684814, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 968.1339721679688, "completions/mean_terminated_length": 744.0107421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.958926002876778, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11612721789367826, "kl": 0.029144287109375, "learning_rate": 1.0467831667442545e-07, "loss": 0.0326, "num_tokens": 2442635166.0, "reward": 2.3956475257873535, "reward_std": 0.3881060779094696, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.14102916419506073, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1045.9398193359375, "completions/mean_terminated_length": 794.025146484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.959139097544084, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12721827847348982, "kl": 0.0238037109375, "learning_rate": 1.0463029357878548e-07, "loss": 0.0459, "num_tokens": 2443178883.0, "reward": 2.34375, "reward_std": 0.43537646532058716, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2854745090007782, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090012550354004, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 974.7188110351562, "completions/mean_terminated_length": 775.9629516601562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9593521922113899, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13129852377983098, "kl": 0.0286865234375, "learning_rate": 1.045825169611879e-07, "loss": 0.0786, "num_tokens": 2443681925.0, "reward": 2.330357313156128, "reward_std": 0.44588029384613037, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14619384706020355, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 886.0402221679688, "completions/mean_terminated_length": 713.2359008789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9595652868786959, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1428843302307251, "kl": 0.030853271484375, "learning_rate": 1.0453498684807333e-07, "loss": 0.0566, "num_tokens": 2444149207.0, "reward": 2.478236675262451, "reward_std": 0.42918145656585693, "rewards/accuracy_reward/mean": 0.5925925970077515, "rewards/accuracy_reward/std": 0.49192148447036743, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.1217813789844513, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1014.794677734375, "completions/mean_terminated_length": 807.0455932617188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9597783815460018, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12925826194484052, "kl": 0.02777099609375, "learning_rate": 1.0448770326574616e-07, "loss": 0.1167, "num_tokens": 2444683291.0, "reward": 2.3699777126312256, "reward_std": 0.42711836099624634, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14661914110183716, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1010.40185546875, "completions/mean_terminated_length": 801.7694702148438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9599914762133077, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 2.022418570712354, "kl": 0.09954833984375, "learning_rate": 1.0444066624037418e-07, "loss": 0.0849, "num_tokens": 2445209583.0, "reward": 2.380580425262451, "reward_std": 0.4589422345161438, "rewards/accuracy_reward/mean": 0.5066964030265808, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.14259286224842072, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 906.6295166015625, "completions/mean_terminated_length": 712.92431640625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9602045708806137, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14450923763817577, "kl": 0.029632568359375, "learning_rate": 1.0439387579798868e-07, "loss": 0.1002, "num_tokens": 2445683753.0, "reward": 2.4068081378936768, "reward_std": 0.3942721486091614, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5005589723587036, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.11993236839771271, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1208.665283203125, "completions/mean_terminated_length": 898.0856323242188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9604176655479196, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.18833848354670926, "kl": 0.02337646484375, "learning_rate": 1.0434733196448481e-07, "loss": 0.0444, "num_tokens": 2446300323.0, "reward": 2.3387277126312256, "reward_std": 0.44807061553001404, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10346972197294235, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 941.7076416015625, "completions/mean_terminated_length": 802.7261352539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9606307602152256, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1307990702578527, "kl": 0.0299072265625, "learning_rate": 1.0430103476562078e-07, "loss": 0.0601, "num_tokens": 2446789584.0, "reward": 2.4927456378936768, "reward_std": 0.3784714639186859, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1091761440038681, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 930.9241333007812, "completions/mean_terminated_length": 768.0767211914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9608438548825315, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.14240070136879604, "kl": 0.02880859375, "learning_rate": 1.0425498422701872e-07, "loss": 0.0569, "num_tokens": 2447281710.0, "reward": 2.4910714626312256, "reward_std": 0.3062003254890442, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493200302124, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09024729579687119, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 949.9598388671875, "completions/mean_terminated_length": 766.953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9610569495498376, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1283643509528888, "kl": 0.030670166015625, "learning_rate": 1.0420918037416405e-07, "loss": 0.0883, "num_tokens": 2447778604.0, "reward": 2.4760046005249023, "reward_std": 0.36500662565231323, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681798309087753, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1032.4107666015625, "completions/mean_terminated_length": 808.2615966796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9612700442171435, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11244210563867672, "kl": 0.025177001953125, "learning_rate": 1.0416362323240563e-07, "loss": 0.0542, "num_tokens": 2448316484.0, "reward": 2.3917412757873535, "reward_std": 0.37942928075790405, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1302192062139511, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 951.0558471679688, "completions/mean_terminated_length": 781.4252319335938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9614831388844494, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12907304274929046, "kl": 0.029052734375, "learning_rate": 1.041183128269559e-07, "loss": 0.0689, "num_tokens": 2448815437.0, "reward": 2.4068081378936768, "reward_std": 0.4706929326057434, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9536830186843872, "rewards/tag_count_reward/std": 0.17443691194057465, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1030.294677734375, "completions/mean_terminated_length": 863.7610473632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9616962335517554, "frac_reward_zero_std": 0.25, "grad_norm": 0.11913687669526882, "kl": 0.027069091796875, "learning_rate": 1.0407324918289062e-07, "loss": 0.0948, "num_tokens": 2449344033.0, "reward": 2.4603796005249023, "reward_std": 0.33730754256248474, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.0952264592051506, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1102.3460693359375, "completions/mean_terminated_length": 899.888916015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9619093282190613, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11906813841396985, "kl": 0.024261474609375, "learning_rate": 1.0402843232514919e-07, "loss": 0.0814, "num_tokens": 2449915660.0, "reward": 2.24609375, "reward_std": 0.4485597610473633, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13485698401927948, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 989.3906860351562, "completions/mean_terminated_length": 759.2581787109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9621224228863673, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14435496849812327, "kl": 0.027587890625, "learning_rate": 1.0398386227853424e-07, "loss": 0.1196, "num_tokens": 2450422379.0, "reward": 2.510044813156128, "reward_std": 0.47749942541122437, "rewards/accuracy_reward/mean": 0.6388888955116272, "rewards/accuracy_reward/std": 0.480879545211792, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14819122850894928, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 900.8839721679688, "completions/mean_terminated_length": 723.4948120117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9623355175536732, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12876539241636106, "kl": 0.0308837890625, "learning_rate": 1.039395390677119e-07, "loss": 0.0433, "num_tokens": 2450891415.0, "reward": 2.4927456378936768, "reward_std": 0.3339018225669861, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9860491156578064, "rewards/tag_count_reward/std": 0.08657421171665192, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1130.669677734375, "completions/mean_terminated_length": 893.6067504882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9625486122209792, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12696968904337438, "kl": 0.02569580078125, "learning_rate": 1.0389546271721169e-07, "loss": 0.0441, "num_tokens": 2451475907.0, "reward": 2.3387277126312256, "reward_std": 0.39402276277542114, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1226801648736, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 937.7522583007812, "completions/mean_terminated_length": 762.751953125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9627617068882851, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12296301672335991, "kl": 0.02825927734375, "learning_rate": 1.0385163325142645e-07, "loss": 0.0726, "num_tokens": 2451960356.0, "reward": 2.3934152126312256, "reward_std": 0.3388652205467224, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174957811832428, "rewards/tag_count_reward/mean": 0.9827008843421936, "rewards/tag_count_reward/std": 0.10230488330125809, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1121.82373046875, "completions/mean_terminated_length": 892.2144775390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9629748015555911, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13172194446205926, "kl": 0.025299072265625, "learning_rate": 1.0380805069461247e-07, "loss": 0.0988, "num_tokens": 2452531861.0, "reward": 2.279576063156128, "reward_std": 0.5081182718276978, "rewards/accuracy_reward/mean": 0.43287035822868347, "rewards/accuracy_reward/std": 0.4960475564002991, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.18185119330883026, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 928.77685546875, "completions/mean_terminated_length": 738.830322265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.963187896222897, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395025307125469, "kl": 0.029632568359375, "learning_rate": 1.0376471507088933e-07, "loss": 0.0692, "num_tokens": 2453017873.0, "reward": 2.5167412757873535, "reward_std": 0.3990950882434845, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9832589030265808, "rewards/tag_count_reward/std": 0.1057564839720726, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1051.6898193359375, "completions/mean_terminated_length": 851.3592529296875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9634009908902029, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11776002520932452, "kl": 0.025543212890625, "learning_rate": 1.0372162640423998e-07, "loss": 0.0689, "num_tokens": 2453568214.0, "reward": 2.345424175262451, "reward_std": 0.3817687928676605, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.15223345160484314, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 992.3638916015625, "completions/mean_terminated_length": 783.4946899414062, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.9636140855575089, "frac_reward_zero_std": 0.25, "grad_norm": 0.12764886347682394, "kl": 0.02728271484375, "learning_rate": 1.0367878471851078e-07, "loss": 0.104, "num_tokens": 2454079337.0, "reward": 2.5619421005249023, "reward_std": 0.3644016683101654, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.47969308495521545, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11170817166566849, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 957.63623046875, "completions/mean_terminated_length": 775.9088745117188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9638271802248148, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.16981841400533865, "kl": 0.02850341796875, "learning_rate": 1.0363619003741125e-07, "loss": 0.0551, "num_tokens": 2454584358.0, "reward": 2.3588171005249023, "reward_std": 0.403976172208786, "rewards/accuracy_reward/mean": 0.4598214328289032, "rewards/accuracy_reward/std": 0.49894019961357117, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12201692909002304, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 918.8772583007812, "completions/mean_terminated_length": 826.1473388671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9640402748921209, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1487468950018143, "kl": 0.0311279296875, "learning_rate": 1.0359384238451425e-07, "loss": 0.0589, "num_tokens": 2455070847.0, "reward": 2.420201063156128, "reward_std": 0.4080529510974884, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.141964390873909, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1121.3504638671875, "completions/mean_terminated_length": 834.143310546875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9642533695594268, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12331570402037917, "kl": 0.022674560546875, "learning_rate": 1.03551741783256e-07, "loss": 0.0639, "num_tokens": 2455648572.0, "reward": 2.234375, "reward_std": 0.41747739911079407, "rewards/accuracy_reward/mean": 0.3472222089767456, "rewards/accuracy_reward/std": 0.47663912177085876, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1063.8148193359375, "completions/mean_terminated_length": 826.6287841796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9644664642267328, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13160024643926505, "kl": 0.02655029296875, "learning_rate": 1.0350988825693606e-07, "loss": 0.0878, "num_tokens": 2456197513.0, "reward": 2.361607313156128, "reward_std": 0.44072797894477844, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.12312835454940796, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 981.07373046875, "completions/mean_terminated_length": 745.593994140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9646795588940387, "frac_reward_zero_std": 0.0, "grad_norm": 0.13488917479080317, "kl": 0.028106689453125, "learning_rate": 1.0346828182871701e-07, "loss": 0.1232, "num_tokens": 2456700458.0, "reward": 2.4614956378936768, "reward_std": 0.4898920953273773, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16087746620178223, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 988.1652221679688, "completions/mean_terminated_length": 778.4652709960938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9648926535613447, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14208512982695667, "kl": 0.029205322265625, "learning_rate": 1.0342692252162486e-07, "loss": 0.0965, "num_tokens": 2457210276.0, "reward": 2.4129464626312256, "reward_std": 0.4732236862182617, "rewards/accuracy_reward/mean": 0.5486111044883728, "rewards/accuracy_reward/std": 0.49820831418037415, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14997069537639618, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1048.1942138671875, "completions/mean_terminated_length": 807.2437744140625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9651057482286506, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11639074797214444, "kl": 0.025634765625, "learning_rate": 1.033858103585489e-07, "loss": 0.0691, "num_tokens": 2457748123.0, "reward": 2.353794813156128, "reward_std": 0.3608246147632599, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1314026117324829, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1011.888427734375, "completions/mean_terminated_length": 832.8743286132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9653188428959565, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13563916315316754, "kl": 0.027374267578125, "learning_rate": 1.0334494536224146e-07, "loss": 0.0688, "num_tokens": 2458273049.0, "reward": 2.521205425262451, "reward_std": 0.4386194348335266, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1219823807477951, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1009.51123046875, "completions/mean_terminated_length": 773.3616943359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9655319375632625, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14195993970076673, "kl": 0.026123046875, "learning_rate": 1.0330432755531823e-07, "loss": 0.0948, "num_tokens": 2458790078.0, "reward": 2.4017858505249023, "reward_std": 0.4572906792163849, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5004791617393494, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1531175673007965, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 952.2813110351562, "completions/mean_terminated_length": 752.796875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9657450322305684, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11964988566802205, "kl": 0.02716064453125, "learning_rate": 1.0326395696025808e-07, "loss": 0.0512, "num_tokens": 2459283964.0, "reward": 2.4893975257873535, "reward_std": 0.41433700919151306, "rewards/accuracy_reward/mean": 0.5758928656578064, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.09769836068153381, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1190.571533203125, "completions/mean_terminated_length": 989.796142578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9659581268978744, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1113744585120054, "kl": 0.022186279296875, "learning_rate": 1.0322383359940299e-07, "loss": 0.0725, "num_tokens": 2459890252.0, "reward": 2.3744421005249023, "reward_std": 0.48005563020706177, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.12471878528594971, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1010.8660888671875, "completions/mean_terminated_length": 781.9618530273438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9661712215651803, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13680666861839627, "kl": 0.02691650390625, "learning_rate": 1.0318395749495825e-07, "loss": 0.1173, "num_tokens": 2460414064.0, "reward": 2.3989956378936768, "reward_std": 0.41356807947158813, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9547991156578064, "rewards/tag_count_reward/std": 0.16820663213729858, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 976.4285888671875, "completions/mean_terminated_length": 801.0805053710938, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9663843162324863, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11915491317299454, "kl": 0.03057861328125, "learning_rate": 1.0314432866899214e-07, "loss": 0.0506, "num_tokens": 2460922272.0, "reward": 2.428013563156128, "reward_std": 0.42051228880882263, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316656589508, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15285542607307434, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 940.5781860351562, "completions/mean_terminated_length": 779.1381225585938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9665974108997922, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12732231780820708, "kl": 0.030120849609375, "learning_rate": 1.0310494714343628e-07, "loss": 0.076, "num_tokens": 2461409571.0, "reward": 2.368861675262451, "reward_std": 0.4147380292415619, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9581473469734192, "rewards/tag_count_reward/std": 0.16403579711914062, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 929.857177734375, "completions/mean_terminated_length": 729.7684326171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9668105055670981, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1328882438595577, "kl": 0.028839111328125, "learning_rate": 1.0306581294008524e-07, "loss": 0.045, "num_tokens": 2461901907.0, "reward": 2.34375, "reward_std": 0.4486410617828369, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.14910244941711426, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1038.9598388671875, "completions/mean_terminated_length": 812.8906860351562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9670236002344041, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13439032971320833, "kl": 0.02667236328125, "learning_rate": 1.0302692608059685e-07, "loss": 0.1374, "num_tokens": 2462432945.0, "reward": 2.40234375, "reward_std": 0.40621766448020935, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681798309087753, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1048.4866943359375, "completions/mean_terminated_length": 837.7783813476562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.96723669490171, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.9398485401626545, "kl": 0.02789306640625, "learning_rate": 1.0298828658649198e-07, "loss": 0.0987, "num_tokens": 2462979755.0, "reward": 2.3917412757873535, "reward_std": 0.49047356843948364, "rewards/accuracy_reward/mean": 0.5245535969734192, "rewards/accuracy_reward/std": 0.49995502829551697, "rewards/format_reward/mean": 0.9040178656578064, "rewards/format_reward/std": 0.29489603638648987, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15610112249851227, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1021.9107666015625, "completions/mean_terminated_length": 828.6683959960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9674497895690161, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14057511149569235, "kl": 0.026763916015625, "learning_rate": 1.0294989447915468e-07, "loss": 0.0938, "num_tokens": 2463507907.0, "reward": 2.431919813156128, "reward_std": 0.41900837421417236, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.12707509100437164, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1114.2098388671875, "completions/mean_terminated_length": 772.5792236328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.967662884236322, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11221753872712896, "kl": 0.02606201171875, "learning_rate": 1.02911749779832e-07, "loss": 0.0652, "num_tokens": 2464075121.0, "reward": 2.3828125, "reward_std": 0.35823947191238403, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12016765773296356, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 994.3303833007812, "completions/mean_terminated_length": 758.2622680664062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.967875978903628, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1349976569501988, "kl": 0.02874755859375, "learning_rate": 1.0287385250963412e-07, "loss": 0.062, "num_tokens": 2464586645.0, "reward": 2.3638393878936768, "reward_std": 0.5170214176177979, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401403427124, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3124580383300781, "rewards/tag_count_reward/mean": 0.9464285969734192, "rewards/tag_count_reward/std": 0.1814327985048294, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1027.5201416015625, "completions/mean_terminated_length": 848.0656127929688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9680890735709339, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13378693307174494, "kl": 0.026947021484375, "learning_rate": 1.028362026895343e-07, "loss": 0.0748, "num_tokens": 2465113086.0, "reward": 2.4375, "reward_std": 0.4152251183986664, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.14903545379638672, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1073.075927734375, "completions/mean_terminated_length": 841.464111328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9683021682382399, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11570952379882345, "kl": 0.025604248046875, "learning_rate": 1.0279880034036882e-07, "loss": 0.0439, "num_tokens": 2465661040.0, "reward": 2.361607313156128, "reward_std": 0.4305654466152191, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1512802243232727, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 965.2991333007812, "completions/mean_terminated_length": 788.1298828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9685152629055458, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1235048270019782, "kl": 0.028564453125, "learning_rate": 1.0276164548283702e-07, "loss": 0.0263, "num_tokens": 2466159414.0, "reward": 2.450892925262451, "reward_std": 0.3759031593799591, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971526861190796, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.09910815209150314, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1102.930908203125, "completions/mean_terminated_length": 903.7000122070312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9687283575728517, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11035869931876219, "kl": 0.022491455078125, "learning_rate": 1.0272473813750134e-07, "loss": 0.0321, "num_tokens": 2466723495.0, "reward": 2.388951063156128, "reward_std": 0.43316563963890076, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1395072489976883, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 903.7388916015625, "completions/mean_terminated_length": 769.6234741210938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9689414522401577, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1280188876211085, "kl": 0.03082275390625, "learning_rate": 1.0268807832478699e-07, "loss": 0.0659, "num_tokens": 2467190466.0, "reward": 2.525111675262451, "reward_std": 0.38166525959968567, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523807287216, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 933.2053833007812, "completions/mean_terminated_length": 764.1234130859375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9691545469074636, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1271939841540721, "kl": 0.027313232421875, "learning_rate": 1.026516660649825e-07, "loss": 0.055, "num_tokens": 2467682446.0, "reward": 2.470424175262451, "reward_std": 0.4126937985420227, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2254217267036438, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11780035495758057, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 952.8370971679688, "completions/mean_terminated_length": 763.6204223632812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9693676415747696, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13170114432518867, "kl": 0.030487060546875, "learning_rate": 1.0261550137823927e-07, "loss": 0.063, "num_tokens": 2468174629.0, "reward": 2.3543527126312256, "reward_std": 0.3728092908859253, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416089117527008, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 913.2076416015625, "completions/mean_terminated_length": 747.7774658203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9695807362420755, "frac_reward_zero_std": 0.0, "grad_norm": 0.13215448944842867, "kl": 0.031982421875, "learning_rate": 1.0257958428457169e-07, "loss": 0.0491, "num_tokens": 2468651106.0, "reward": 2.56640625, "reward_std": 0.3840310275554657, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.11898136883974075, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 959.5803833007812, "completions/mean_terminated_length": 784.7564697265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9697938309093815, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12489856807913137, "kl": 0.027374267578125, "learning_rate": 1.0254391480385704e-07, "loss": 0.0675, "num_tokens": 2469150470.0, "reward": 2.3833706378936768, "reward_std": 0.3548505902290344, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.4992803633213043, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824846744537354, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.11702417582273483, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1048.203125, "completions/mean_terminated_length": 834.1544799804688, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9700069255766874, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1438777115800212, "kl": 0.025604248046875, "learning_rate": 1.0250849295583575e-07, "loss": 0.0963, "num_tokens": 2469689137.0, "reward": 2.421875, "reward_std": 0.4538811445236206, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.4998401701450348, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1512802243232727, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 864.7924194335938, "completions/mean_terminated_length": 735.92822265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9702200202439933, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1314536151067218, "kl": 0.031402587890625, "learning_rate": 1.0247331876011102e-07, "loss": 0.0247, "num_tokens": 2470144292.0, "reward": 2.549107313156128, "reward_std": 0.41681161522865295, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4803536534309387, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1136813759803772, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1096.5804443359375, "completions/mean_terminated_length": 819.6541748046875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9704331149112994, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12417003315360922, "kl": 0.025177001953125, "learning_rate": 1.0243839223614909e-07, "loss": 0.0282, "num_tokens": 2470705624.0, "reward": 2.37890625, "reward_std": 0.4431673288345337, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.15557540953159332, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1038.321533203125, "completions/mean_terminated_length": 822.1572265625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9706462095786053, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11236099387145804, "kl": 0.026214599609375, "learning_rate": 1.0240371340327916e-07, "loss": 0.0634, "num_tokens": 2471237592.0, "reward": 2.431919813156128, "reward_std": 0.43935853242874146, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15750236809253693, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1035.368408203125, "completions/mean_terminated_length": 811.8719482421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9708593042459113, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12001272095026856, "kl": 0.026947021484375, "learning_rate": 1.023692822806933e-07, "loss": 0.0727, "num_tokens": 2471767325.0, "reward": 2.579799175262451, "reward_std": 0.40030691027641296, "rewards/accuracy_reward/mean": 0.6584821343421936, "rewards/accuracy_reward/std": 0.4747488796710968, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.14126798510551453, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 984.2991333007812, "completions/mean_terminated_length": 793.95263671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9710723989132172, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12931451861264318, "kl": 0.02813720703125, "learning_rate": 1.0233509888744648e-07, "loss": 0.0862, "num_tokens": 2472275763.0, "reward": 2.4525671005249023, "reward_std": 0.4722106456756592, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664569377899, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14418935775756836, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1028.578125, "completions/mean_terminated_length": 858.6744995117188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9712854935805232, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1302239957203249, "kl": 0.02630615234375, "learning_rate": 1.023011632424566e-07, "loss": 0.0827, "num_tokens": 2472809110.0, "reward": 2.4910714626312256, "reward_std": 0.5058425664901733, "rewards/accuracy_reward/mean": 0.6138392686843872, "rewards/accuracy_reward/std": 0.4874124526977539, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.1531175673007965, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 981.0156860351562, "completions/mean_terminated_length": 809.6347045898438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9714985882478291, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12474688090803428, "kl": 0.028106689453125, "learning_rate": 1.0226747536450442e-07, "loss": 0.0549, "num_tokens": 2473315469.0, "reward": 2.3973214626312256, "reward_std": 0.4472970962524414, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.10992966592311859, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 962.1317138671875, "completions/mean_terminated_length": 733.2189331054688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9717116829151351, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.35121812887412146, "kl": 0.0341796875, "learning_rate": 1.0223403527223367e-07, "loss": 0.0489, "num_tokens": 2473816424.0, "reward": 2.5, "reward_std": 0.4159477949142456, "rewards/accuracy_reward/mean": 0.6004464030265808, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.1249600425362587, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1056.7567138671875, "completions/mean_terminated_length": 817.8698120117188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.971924777582441, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12869293517339084, "kl": 0.02471923828125, "learning_rate": 1.0220084298415081e-07, "loss": 0.0527, "num_tokens": 2474355131.0, "reward": 2.3956475257873535, "reward_std": 0.4562458395957947, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15870553255081177, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 994.1785888671875, "completions/mean_terminated_length": 750.989013671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9721378722497469, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13230402152417267, "kl": 0.028961181640625, "learning_rate": 1.0216789851862526e-07, "loss": 0.0875, "num_tokens": 2474874651.0, "reward": 2.4213171005249023, "reward_std": 0.4332205355167389, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13416090607643127, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1111.680908203125, "completions/mean_terminated_length": 817.8797607421875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9723509669170529, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11665414636447166, "kl": 0.02392578125, "learning_rate": 1.021352018938892e-07, "loss": 0.0833, "num_tokens": 2475447884.0, "reward": 2.3348214626312256, "reward_std": 0.48968520760536194, "rewards/accuracy_reward/mean": 0.4330357015132904, "rewards/accuracy_reward/std": 0.4960494339466095, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.14672233164310455, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1009.5402221679688, "completions/mean_terminated_length": 737.492919921875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9725640615843588, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1426924409758636, "kl": 0.026702880859375, "learning_rate": 1.0210275312803782e-07, "loss": 0.0999, "num_tokens": 2475973582.0, "reward": 2.431361675262451, "reward_std": 0.370706707239151, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9893973469734192, "rewards/tag_count_reward/std": 0.08036740869283676, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1090.65185546875, "completions/mean_terminated_length": 853.3147583007812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9727771562516648, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12053280080268028, "kl": 0.0255126953125, "learning_rate": 1.0207055223902898e-07, "loss": 0.0836, "num_tokens": 2476533602.0, "reward": 2.337611675262451, "reward_std": 0.4732224941253662, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13748814165592194, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1028.6429443359375, "completions/mean_terminated_length": 823.6782836914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9729902509189707, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13058058956208124, "kl": 0.026519775390625, "learning_rate": 1.0203859924468339e-07, "loss": 0.0671, "num_tokens": 2477061170.0, "reward": 2.443638563156128, "reward_std": 0.49733930826187134, "rewards/accuracy_reward/mean": 0.5856481194496155, "rewards/accuracy_reward/std": 0.49318093061447144, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.16821405291557312, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 930.46435546875, "completions/mean_terminated_length": 757.6494750976562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9732033455862767, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14468255213054854, "kl": 0.03131103515625, "learning_rate": 1.0200689416268454e-07, "loss": 0.0355, "num_tokens": 2477548370.0, "reward": 2.4386162757873535, "reward_std": 0.4184619188308716, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.10404275357723236, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1041.3660888671875, "completions/mean_terminated_length": 822.5326538085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9734164402535826, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11402298933738973, "kl": 0.025634765625, "learning_rate": 1.0197543701057887e-07, "loss": 0.0587, "num_tokens": 2478082838.0, "reward": 2.373326063156128, "reward_std": 0.3784697353839874, "rewards/accuracy_reward/mean": 0.4308035671710968, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174957811832428, "rewards/tag_count_reward/mean": 0.9849330186843872, "rewards/tag_count_reward/std": 0.08956517279148102, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 941.4910888671875, "completions/mean_terminated_length": 757.0729370117188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9736295349208886, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.14566787121356425, "kl": 0.031463623046875, "learning_rate": 1.019442278057755e-07, "loss": 0.0356, "num_tokens": 2478578034.0, "reward": 2.3253350257873535, "reward_std": 0.40917083621025085, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9771205186843872, "rewards/tag_count_reward/std": 0.1091761440038681, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 950.0491333007812, "completions/mean_terminated_length": 776.9871215820312, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.9738426295881946, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1275142344408925, "kl": 0.03118896484375, "learning_rate": 1.0191326656554624e-07, "loss": 0.0561, "num_tokens": 2479071576.0, "reward": 2.5223214626312256, "reward_std": 0.42247384786605835, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.48893147706985474, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08709356933832169, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 982.7366333007812, "completions/mean_terminated_length": 778.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9740557242555005, "frac_reward_zero_std": 0.25, "grad_norm": 0.1374145416413502, "kl": 0.02734375, "learning_rate": 1.0188255330702583e-07, "loss": 0.0596, "num_tokens": 2479580066.0, "reward": 2.470982313156128, "reward_std": 0.34999993443489075, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.13361193239688873, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1067.622802734375, "completions/mean_terminated_length": 854.497314453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9742688189228065, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 7.422554205054337, "kl": 0.237060546875, "learning_rate": 1.018520880472117e-07, "loss": 0.0638, "num_tokens": 2480132409.0, "reward": 2.3203125, "reward_std": 0.4069780707359314, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330368638038635, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13189572095870972, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1163.46875, "completions/mean_terminated_length": 882.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9744819135901124, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.119016318684159, "kl": 0.023284912109375, "learning_rate": 1.0182187080296403e-07, "loss": 0.0789, "num_tokens": 2480732875.0, "reward": 2.3052456378936768, "reward_std": 0.4759340286254883, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49080711603164673, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.1552460491657257, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 976.43310546875, "completions/mean_terminated_length": 757.5107421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.9746950082574184, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11748737471838662, "kl": 0.027069091796875, "learning_rate": 1.0179190159100574e-07, "loss": 0.0591, "num_tokens": 2481230797.0, "reward": 2.5513393878936768, "reward_std": 0.3492380678653717, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1086503118276596, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1069.665283203125, "completions/mean_terminated_length": 781.2543334960938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9749081029247243, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12790316023937567, "kl": 0.026153564453125, "learning_rate": 1.0176218042792257e-07, "loss": 0.0705, "num_tokens": 2481780887.0, "reward": 2.3504464626312256, "reward_std": 0.46307647228240967, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547336578369, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9553571343421936, "rewards/tag_count_reward/std": 0.17041954398155212, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1056.8504638671875, "completions/mean_terminated_length": 779.3285522460938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9751211975920303, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1045995023309239, "kl": 0.025909423828125, "learning_rate": 1.017327073301628e-07, "loss": 0.042, "num_tokens": 2482327972.0, "reward": 2.4481027126312256, "reward_std": 0.4026567041873932, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9748883843421936, "rewards/tag_count_reward/std": 0.12870891392230988, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1190.80810546875, "completions/mean_terminated_length": 873.620849609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9753342922593362, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11929608074266028, "kl": 0.0224609375, "learning_rate": 1.0170348231403762e-07, "loss": 0.0587, "num_tokens": 2482933998.0, "reward": 2.1796875, "reward_std": 0.4283905327320099, "rewards/accuracy_reward/mean": 0.3013392984867096, "rewards/accuracy_reward/std": 0.4593527019023895, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1464626044034958, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 948.1272583007812, "completions/mean_terminated_length": 726.9732055664062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9755473869266421, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14064811189011178, "kl": 0.028045654296875, "learning_rate": 1.016745053957208e-07, "loss": 0.0678, "num_tokens": 2483431239.0, "reward": 2.3744421005249023, "reward_std": 0.4493979513645172, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.1456623673439026, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1106.3616943359375, "completions/mean_terminated_length": 818.10498046875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.9757604815939481, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12180798588477937, "kl": 0.025299072265625, "learning_rate": 1.0164577659124884e-07, "loss": 0.0548, "num_tokens": 2483992457.0, "reward": 2.3638393878936768, "reward_std": 0.40271928906440735, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.13480259478092194, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 990.9464721679688, "completions/mean_terminated_length": 757.645751953125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.975973576261254, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13191626035692133, "kl": 0.027130126953125, "learning_rate": 1.0161729591652094e-07, "loss": 0.1161, "num_tokens": 2484508913.0, "reward": 2.4408483505249023, "reward_std": 0.44707101583480835, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13709372282028198, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 909.1585083007812, "completions/mean_terminated_length": 749.7786254882812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.97618667092856, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12290649022163078, "kl": 0.0291748046875, "learning_rate": 1.0158906338729903e-07, "loss": 0.054, "num_tokens": 2484984184.0, "reward": 2.4402902126312256, "reward_std": 0.363080233335495, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396106719971, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1131737232208252, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 878.5647583007812, "completions/mean_terminated_length": 721.6531982421875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9763997655958659, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12968401178556244, "kl": 0.02911376953125, "learning_rate": 1.0156107901920756e-07, "loss": 0.0215, "num_tokens": 2485446485.0, "reward": 2.4034600257873535, "reward_std": 0.3069804310798645, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855974316596985, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18578432500362396, "rewards/tag_count_reward/mean": 0.9838169813156128, "rewards/tag_count_reward/std": 0.09244592487812042, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 920.9933471679688, "completions/mean_terminated_length": 763.2697143554688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.976612860263172, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14293294193855025, "kl": 0.029754638671875, "learning_rate": 1.015333428277338e-07, "loss": 0.0642, "num_tokens": 2485925250.0, "reward": 2.4029018878936768, "reward_std": 0.4738227427005768, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2918064594268799, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1576608121395111, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1044.544677734375, "completions/mean_terminated_length": 799.2555541992188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9768259549304779, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12787123560856678, "kl": 0.02716064453125, "learning_rate": 1.0150585482822757e-07, "loss": 0.0715, "num_tokens": 2486468262.0, "reward": 2.318638563156128, "reward_std": 0.4058946371078491, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13099703192710876, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1056.6875, "completions/mean_terminated_length": 782.7350463867188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9770390495977839, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10968157446980062, "kl": 0.024688720703125, "learning_rate": 1.014786150359014e-07, "loss": 0.0812, "num_tokens": 2487016842.0, "reward": 2.3510046005249023, "reward_std": 0.3474770188331604, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291378259658813, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.1098259910941124, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1040.0335693359375, "completions/mean_terminated_length": 817.5667724609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9772521442650898, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12099648631276447, "kl": 0.0257568359375, "learning_rate": 1.0145162346583037e-07, "loss": 0.078, "num_tokens": 2487555849.0, "reward": 2.4129464626312256, "reward_std": 0.4698973000049591, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9620535969734192, "rewards/tag_count_reward/std": 0.16112655401229858, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 844.7835083007812, "completions/mean_terminated_length": 693.6256103515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9774652389323957, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15932053810230012, "kl": 0.0350341796875, "learning_rate": 1.0142488013295241e-07, "loss": 0.0877, "num_tokens": 2488005288.0, "reward": 2.4213171005249023, "reward_std": 0.4079744517803192, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.1226801648736, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1046.857177734375, "completions/mean_terminated_length": 858.31298828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9776783335997017, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12455129273202828, "kl": 0.024261474609375, "learning_rate": 1.013983850520677e-07, "loss": 0.0507, "num_tokens": 2488558776.0, "reward": 2.5066964626312256, "reward_std": 0.45638835430145264, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967316269874573, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 853.8705444335938, "completions/mean_terminated_length": 669.2113037109375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9778914282670076, "frac_reward_zero_std": 0.25, "grad_norm": 0.1207488082318332, "kl": 0.031341552734375, "learning_rate": 1.0137213823783937e-07, "loss": 0.0634, "num_tokens": 2489010030.0, "reward": 2.510044813156128, "reward_std": 0.35181424021720886, "rewards/accuracy_reward/mean": 0.6026785969734192, "rewards/accuracy_reward/std": 0.48989060521125793, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1421017199754715, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1133.2835693359375, "completions/mean_terminated_length": 846.260986328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9781045229343136, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10599325655790867, "kl": 0.022796630859375, "learning_rate": 1.0134613970479301e-07, "loss": 0.0388, "num_tokens": 2489588333.0, "reward": 2.365513563156128, "reward_std": 0.3733319938182831, "rewards/accuracy_reward/mean": 0.46990740299224854, "rewards/accuracy_reward/std": 0.4996722936630249, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.13148215413093567, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 860.8772583007812, "completions/mean_terminated_length": 704.992431640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9783176176016195, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14047312865017636, "kl": 0.03009033203125, "learning_rate": 1.013203894673168e-07, "loss": 0.0675, "num_tokens": 2490038246.0, "reward": 2.4849331378936768, "reward_std": 0.3962576687335968, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12224180996417999, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 947.3616333007812, "completions/mean_terminated_length": 767.2571411132812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9785307122689255, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12733132700606797, "kl": 0.029327392578125, "learning_rate": 1.0129488753966151e-07, "loss": 0.0496, "num_tokens": 2490523560.0, "reward": 2.4603796005249023, "reward_std": 0.4324565827846527, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.13423532247543335, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 924.8013916015625, "completions/mean_terminated_length": 741.0051879882812, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9787438069362314, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.14259887359901344, "kl": 0.032470703125, "learning_rate": 1.0126963393594051e-07, "loss": 0.0807, "num_tokens": 2491008015.0, "reward": 2.5691964626312256, "reward_std": 0.45529991388320923, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.470055490732193, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.1523328423500061, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1029.247802734375, "completions/mean_terminated_length": 773.1368408203125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9789569016035373, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.122596561352527, "kl": 0.027923583984375, "learning_rate": 1.0124462867012975e-07, "loss": 0.082, "num_tokens": 2491532590.0, "reward": 2.4559152126312256, "reward_std": 0.375784695148468, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11067523807287216, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 852.6272583007812, "completions/mean_terminated_length": 705.8270874023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9791699962708433, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12906195772956755, "kl": 0.0325927734375, "learning_rate": 1.0121987175606772e-07, "loss": 0.0407, "num_tokens": 2491977511.0, "reward": 2.571986675262451, "reward_std": 0.37627726793289185, "rewards/accuracy_reward/mean": 0.6607142686843872, "rewards/accuracy_reward/std": 0.47399622201919556, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.1246887594461441, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1011.9063110351562, "completions/mean_terminated_length": 823.277099609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9793830909381492, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1282979030935075, "kl": 0.02593994140625, "learning_rate": 1.011953632074555e-07, "loss": 0.0267, "num_tokens": 2492501805.0, "reward": 2.3470983505249023, "reward_std": 0.4520379602909088, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.1478201001882553, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 911.2678833007812, "completions/mean_terminated_length": 735.4844970703125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9795961856054552, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1407619558062639, "kl": 0.02862548828125, "learning_rate": 1.0117110303785667e-07, "loss": 0.0202, "num_tokens": 2492975141.0, "reward": 2.458705425262451, "reward_std": 0.34212526679039, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.4992803931236267, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11632467061281204, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1026.01123046875, "completions/mean_terminated_length": 817.2177124023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9798092802727612, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12161803198252391, "kl": 0.028228759765625, "learning_rate": 1.0114709126069738e-07, "loss": 0.0764, "num_tokens": 2493500330.0, "reward": 2.4481027126312256, "reward_std": 0.4298285245895386, "rewards/accuracy_reward/mean": 0.5580357313156128, "rewards/accuracy_reward/std": 0.4971756041049957, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407156348228455, "rewards/tag_count_reward/mean": 0.9592633843421936, "rewards/tag_count_reward/std": 0.16000598669052124, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1112.7388916015625, "completions/mean_terminated_length": 833.5159912109375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9800223749400672, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13233298573463154, "kl": 0.023590087890625, "learning_rate": 1.0112332788926631e-07, "loss": 0.1167, "num_tokens": 2494073797.0, "reward": 2.3950893878936768, "reward_std": 0.4518040418624878, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11967317014932632, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1042.6273193359375, "completions/mean_terminated_length": 820.7329711914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9802354696073731, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13560477908488563, "kl": 0.027069091796875, "learning_rate": 1.010998129367147e-07, "loss": 0.0945, "num_tokens": 2494612846.0, "reward": 2.34765625, "reward_std": 0.4202481508255005, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12825222313404083, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 988.6563110351562, "completions/mean_terminated_length": 772.231201171875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9804485642746791, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1288056139294807, "kl": 0.02947998046875, "learning_rate": 1.010765464160562e-07, "loss": 0.0808, "num_tokens": 2495126180.0, "reward": 2.4263393878936768, "reward_std": 0.3949876129627228, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1086503118276596, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1091.243408203125, "completions/mean_terminated_length": 850.7178344726562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.980661658941985, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12712819782148058, "kl": 0.024444580078125, "learning_rate": 1.0105352834016717e-07, "loss": 0.1197, "num_tokens": 2495679793.0, "reward": 2.400669813156128, "reward_std": 0.4086795747280121, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.11899843066930771, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1023.8951416015625, "completions/mean_terminated_length": 843.8031616210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9808747536092909, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14247333059229758, "kl": 0.025482177734375, "learning_rate": 1.0103075872178624e-07, "loss": 0.0854, "num_tokens": 2496201938.0, "reward": 2.3599331378936768, "reward_std": 0.46483901143074036, "rewards/accuracy_reward/mean": 0.4665178656578064, "rewards/accuracy_reward/std": 0.4994353652000427, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13335825502872467, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 934.2188110351562, "completions/mean_terminated_length": 812.9158325195312, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9810878482765969, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1301817503346622, "kl": 0.027374267578125, "learning_rate": 1.0100823757351468e-07, "loss": 0.0463, "num_tokens": 2496693236.0, "reward": 2.5145089626312256, "reward_std": 0.4172426760196686, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936831295490265, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 897.8839721679688, "completions/mean_terminated_length": 740.2537841796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9813009429439028, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.15066847975770556, "kl": 0.0321044921875, "learning_rate": 1.0098596490781626e-07, "loss": 0.0622, "num_tokens": 2497171664.0, "reward": 2.51171875, "reward_std": 0.39012038707733154, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1041671633720398, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1234.9710693359375, "completions/mean_terminated_length": 976.7147216796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9815140376112088, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.10381698427275403, "kl": 0.02154541015625, "learning_rate": 1.0096394073701716e-07, "loss": 0.0598, "num_tokens": 2497798003.0, "reward": 2.322544813156128, "reward_std": 0.47268208861351013, "rewards/accuracy_reward/mean": 0.4486607015132904, "rewards/accuracy_reward/std": 0.49791330099105835, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.16375111043453217, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1064.555908203125, "completions/mean_terminated_length": 834.2727661132812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9817271322785147, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.4041997187709315, "kl": 0.03558349609375, "learning_rate": 1.0094216507330605e-07, "loss": 0.0733, "num_tokens": 2498349596.0, "reward": 2.4285714626312256, "reward_std": 0.4024399518966675, "rewards/accuracy_reward/mean": 0.5044642686843872, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093553841114044, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12607400119304657, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 908.0045166015625, "completions/mean_terminated_length": 751.7614135742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9819402269458207, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.15021375593534864, "kl": 0.02667236328125, "learning_rate": 1.0092063792873417e-07, "loss": 0.1023, "num_tokens": 2498825726.0, "reward": 2.4190850257873535, "reward_std": 0.3227223753929138, "rewards/accuracy_reward/mean": 0.48148149251937866, "rewards/accuracy_reward/std": 0.5002362728118896, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1616371124982834, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10210946202278137, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1027.290283203125, "completions/mean_terminated_length": 781.3019409179688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9821533216131266, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.11745036497856069, "kl": 0.02557373046875, "learning_rate": 1.0089935931521508e-07, "loss": 0.0819, "num_tokens": 2499356256.0, "reward": 2.4698662757873535, "reward_std": 0.3586699664592743, "rewards/accuracy_reward/mean": 0.5513392686843872, "rewards/accuracy_reward/std": 0.49791327118873596, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12292034178972244, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 991.9598388671875, "completions/mean_terminated_length": 796.3967895507812, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9823664162804325, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13498829912557073, "kl": 0.026824951171875, "learning_rate": 1.008783292445249e-07, "loss": 0.0652, "num_tokens": 2499875054.0, "reward": 2.4408483505249023, "reward_std": 0.41820070147514343, "rewards/accuracy_reward/mean": 0.5334821343421936, "rewards/accuracy_reward/std": 0.4994353950023651, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14262787997722626, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 909.919677734375, "completions/mean_terminated_length": 763.7178344726562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9825795109477385, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13713169826934785, "kl": 0.029632568359375, "learning_rate": 1.0085754772830213e-07, "loss": 0.0644, "num_tokens": 2500349610.0, "reward": 2.5970983505249023, "reward_std": 0.38837674260139465, "rewards/accuracy_reward/mean": 0.6919642686843872, "rewards/accuracy_reward/std": 0.46219751238822937, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.13083133101463318, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1118.1785888671875, "completions/mean_terminated_length": 829.98828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9827926056150444, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11933252141063329, "kl": 0.02288818359375, "learning_rate": 1.0083701477804778e-07, "loss": 0.0623, "num_tokens": 2500928010.0, "reward": 2.341517925262451, "reward_std": 0.47632312774658203, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.15048591792583466, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 965.450927734375, "completions/mean_terminated_length": 798.04638671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9830057002823505, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14426238466822397, "kl": 0.027313232421875, "learning_rate": 1.0081673040512528e-07, "loss": 0.0882, "num_tokens": 2501428276.0, "reward": 2.5853796005249023, "reward_std": 0.4392961263656616, "rewards/accuracy_reward/mean": 0.6674107313156128, "rewards/accuracy_reward/std": 0.47166746854782104, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11561822891235352, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 916.4063110351562, "completions/mean_terminated_length": 724.3603515625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9832187949496564, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13164649836017533, "kl": 0.02789306640625, "learning_rate": 1.0079669462076038e-07, "loss": 0.0466, "num_tokens": 2501899546.0, "reward": 2.6400671005249023, "reward_std": 0.3391387164592743, "rewards/accuracy_reward/mean": 0.7098214030265808, "rewards/accuracy_reward/std": 0.4543519914150238, "rewards/format_reward/mean": 0.9486607313156128, "rewards/format_reward/std": 0.22093555331230164, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.0950164794921875, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 950.8906860351562, "completions/mean_terminated_length": 768.0390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9834318896169624, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13825253741956947, "kl": 0.028564453125, "learning_rate": 1.0077690743604151e-07, "loss": 0.0722, "num_tokens": 2502387833.0, "reward": 2.482701063156128, "reward_std": 0.36174842715263367, "rewards/accuracy_reward/mean": 0.5915178656578064, "rewards/accuracy_reward/std": 0.49210265278816223, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822286784648895, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.1131737232208252, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1017.6317138671875, "completions/mean_terminated_length": 800.4189453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9836449842842683, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12025794610810049, "kl": 0.027008056640625, "learning_rate": 1.0075736886191923e-07, "loss": 0.0699, "num_tokens": 2502920884.0, "reward": 2.4609375, "reward_std": 0.40975409746170044, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1286761611700058, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1011.6406860351562, "completions/mean_terminated_length": 789.7642211914062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9838580789515743, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13748170244070565, "kl": 0.02825927734375, "learning_rate": 1.0073807890920672e-07, "loss": 0.0477, "num_tokens": 2503452307.0, "reward": 2.446986675262451, "reward_std": 0.4406866133213043, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509716033935547, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9603794813156128, "rewards/tag_count_reward/std": 0.1503853052854538, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 995.7433471679688, "completions/mean_terminated_length": 749.3471069335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9840711736188802, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11010353512670458, "kl": 0.027984619140625, "learning_rate": 1.0071903758857942e-07, "loss": 0.061, "num_tokens": 2503964544.0, "reward": 2.3627233505249023, "reward_std": 0.4092611074447632, "rewards/accuracy_reward/mean": 0.4722222089767456, "rewards/accuracy_reward/std": 0.49980661273002625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14360485970973969, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 994.6339721679688, "completions/mean_terminated_length": 799.5661010742188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9842842682861861, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13308786196167907, "kl": 0.026397705078125, "learning_rate": 1.0070024491057531e-07, "loss": 0.1027, "num_tokens": 2504472860.0, "reward": 2.41796875, "reward_std": 0.4522850513458252, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9626116156578064, "rewards/tag_count_reward/std": 0.1581934094429016, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 842.2857666015625, "completions/mean_terminated_length": 694.2155151367188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9844973629534921, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.14670130252315636, "kl": 0.032196044921875, "learning_rate": 1.0068170088559468e-07, "loss": 0.059, "num_tokens": 2504929516.0, "reward": 2.5379464626312256, "reward_std": 0.41874274611473083, "rewards/accuracy_reward/mean": 0.6116071343421936, "rewards/accuracy_reward/std": 0.4879295527935028, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10688954591751099, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1022.2031860351562, "completions/mean_terminated_length": 848.1123046875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.984710457620798, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12000329455197178, "kl": 0.025909423828125, "learning_rate": 1.0066340552390021e-07, "loss": 0.0661, "num_tokens": 2505457735.0, "reward": 2.463169813156128, "reward_std": 0.4484398663043976, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422912120819, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9676339030265808, "rewards/tag_count_reward/std": 0.13507550954818726, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 895.5379638671875, "completions/mean_terminated_length": 737.5863037109375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.984923552288104, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.15414747514470606, "kl": 0.02978515625, "learning_rate": 1.0064535883561705e-07, "loss": 0.1071, "num_tokens": 2505932280.0, "reward": 2.5474331378936768, "reward_std": 0.3957427740097046, "rewards/accuracy_reward/mean": 0.6643518805503845, "rewards/accuracy_reward/std": 0.4727640450000763, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.13802284002304077, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 900.2188110351562, "completions/mean_terminated_length": 778.3555908203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9851366469554099, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12845038849644544, "kl": 0.03033447265625, "learning_rate": 1.0062756083073256e-07, "loss": 0.0467, "num_tokens": 2506400362.0, "reward": 2.4849331378936768, "reward_std": 0.3705940842628479, "rewards/accuracy_reward/mean": 0.5825892686843872, "rewards/accuracy_reward/std": 0.4936830997467041, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.1127205565571785, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 943.4285888671875, "completions/mean_terminated_length": 772.6185302734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9853497416227159, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13469290304824677, "kl": 0.0286865234375, "learning_rate": 1.0061001151909662e-07, "loss": 0.0485, "num_tokens": 2506889450.0, "reward": 2.5357143878936768, "reward_std": 0.3804994821548462, "rewards/accuracy_reward/mean": 0.6160714030265808, "rewards/accuracy_reward/std": 0.48688453435897827, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9799107313156128, "rewards/tag_count_reward/std": 0.11036036163568497, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 962.013427734375, "completions/mean_terminated_length": 816.2987670898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9855628362900218, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.15586313497081986, "kl": 0.030303955078125, "learning_rate": 1.0059271091042145e-07, "loss": 0.0677, "num_tokens": 2507397360.0, "reward": 2.513392925262451, "reward_std": 0.5189526081085205, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.1353386640548706, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 951.8504638671875, "completions/mean_terminated_length": 779.0723876953125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9857759309573277, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12537519947580655, "kl": 0.028778076171875, "learning_rate": 1.005756590142816e-07, "loss": 0.092, "num_tokens": 2507891789.0, "reward": 2.510044813156128, "reward_std": 0.4289758503437042, "rewards/accuracy_reward/mean": 0.6294642686843872, "rewards/accuracy_reward/std": 0.48348814249038696, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9587053656578064, "rewards/tag_count_reward/std": 0.1662929803133011, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 973.9933471679688, "completions/mean_terminated_length": 768.3323974609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9859890256246338, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1400751997365586, "kl": 0.02630615234375, "learning_rate": 1.00558855840114e-07, "loss": 0.0999, "num_tokens": 2508400586.0, "reward": 2.4229912757873535, "reward_std": 0.46192002296447754, "rewards/accuracy_reward/mean": 0.5290178656578064, "rewards/accuracy_reward/std": 0.49971529841423035, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1599874496459961, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1098.7076416015625, "completions/mean_terminated_length": 869.9307250976562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9862021202919397, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11808584481360845, "kl": 0.0238037109375, "learning_rate": 1.005423013972179e-07, "loss": 0.0501, "num_tokens": 2508967175.0, "reward": 2.357142925262451, "reward_std": 0.44420138001441956, "rewards/accuracy_reward/mean": 0.4397321343421936, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10013572871685028, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1105.7098388671875, "completions/mean_terminated_length": 848.7216186523438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9864152149592457, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.11702725180264857, "kl": 0.02337646484375, "learning_rate": 1.0052599569475489e-07, "loss": 0.0427, "num_tokens": 2509538293.0, "reward": 2.404576063156128, "reward_std": 0.4085484445095062, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 992.8482666015625, "completions/mean_terminated_length": 816.9896240234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9866283096265516, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13375888588934118, "kl": 0.0289306640625, "learning_rate": 1.0050993874174902e-07, "loss": 0.097, "num_tokens": 2510045265.0, "reward": 2.4129464626312256, "reward_std": 0.4630640745162964, "rewards/accuracy_reward/mean": 0.5133928656578064, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.1299937218427658, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1063.446533203125, "completions/mean_terminated_length": 805.5211181640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.9868414042938576, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.13318615237794618, "kl": 0.027679443359375, "learning_rate": 1.0049413054708648e-07, "loss": 0.0862, "num_tokens": 2510603977.0, "reward": 2.3761162757873535, "reward_std": 0.5042670369148254, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.8950892686843872, "rewards/format_reward/std": 0.3067809045314789, "rewards/tag_count_reward/mean": 0.9430803656578064, "rewards/tag_count_reward/std": 0.18948033452033997, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1013.6920166015625, "completions/mean_terminated_length": 812.3466796875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9870544989611635, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.12440073149268047, "kl": 0.0257568359375, "learning_rate": 1.0047857111951591e-07, "loss": 0.0582, "num_tokens": 2511132255.0, "reward": 2.435826063156128, "reward_std": 0.39106255769729614, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.13894234597682953, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1066.33935546875, "completions/mean_terminated_length": 787.8739624023438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9872675936284695, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.10502918809200727, "kl": 0.024810791015625, "learning_rate": 1.0046326046764833e-07, "loss": 0.0421, "num_tokens": 2511684167.0, "reward": 2.3314733505249023, "reward_std": 0.3549799621105194, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.1292569637298584, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 985.3125610351562, "completions/mean_terminated_length": 798.4356689453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9874806882957754, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13393003132771097, "kl": 0.029205322265625, "learning_rate": 1.0044819859995701e-07, "loss": 0.1131, "num_tokens": 2512197987.0, "reward": 2.4347100257873535, "reward_std": 0.4993771016597748, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9129464030265808, "rewards/format_reward/std": 0.2822287082672119, "rewards/tag_count_reward/mean": 0.9614955186843872, "rewards/tag_count_reward/std": 0.14691685140132904, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 966.82373046875, "completions/mean_terminated_length": 735.352294921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9876937829630813, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14013623607782247, "kl": 0.02880859375, "learning_rate": 1.0043338552477749e-07, "loss": 0.0454, "num_tokens": 2512697412.0, "reward": 2.392857313156128, "reward_std": 0.3513668477535248, "rewards/accuracy_reward/mean": 0.4754464328289032, "rewards/accuracy_reward/std": 0.4999549984931946, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9732142686843872, "rewards/tag_count_reward/std": 0.12449962645769119, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1054.484375, "completions/mean_terminated_length": 790.6694946289062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9879068776303873, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1515832031858971, "kl": 0.025665283203125, "learning_rate": 1.0041882125030765e-07, "loss": 0.0709, "num_tokens": 2513240061.0, "reward": 2.3247768878936768, "reward_std": 0.39368030428886414, "rewards/accuracy_reward/mean": 0.44675925374031067, "rewards/accuracy_reward/std": 0.4977337718009949, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9631696343421936, "rewards/tag_count_reward/std": 0.15247619152069092, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1102.040283203125, "completions/mean_terminated_length": 823.1734008789062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9881199722976932, "frac_reward_zero_std": 0.0, "grad_norm": 0.12121189322529195, "kl": 0.02593994140625, "learning_rate": 1.0040450578460779e-07, "loss": 0.0297, "num_tokens": 2513800559.0, "reward": 2.3325893878936768, "reward_std": 0.434671550989151, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.11119430512189865, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 937.21435546875, "completions/mean_terminated_length": 728.0211791992188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9883330669649992, "frac_reward_zero_std": 0.0, "grad_norm": 0.14604665733245753, "kl": 0.03173828125, "learning_rate": 1.0039043913560035e-07, "loss": 0.1058, "num_tokens": 2514289039.0, "reward": 2.4949777126312256, "reward_std": 0.44186243414878845, "rewards/accuracy_reward/mean": 0.5959821343421936, "rewards/accuracy_reward/std": 0.49124953150749207, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13622933626174927, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1084.466552734375, "completions/mean_terminated_length": 858.8457641601562, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9885461616323051, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.19393625726608374, "kl": 0.02923583984375, "learning_rate": 1.0037662131107016e-07, "loss": 0.1093, "num_tokens": 2514845280.0, "reward": 2.4135046005249023, "reward_std": 0.4961911141872406, "rewards/accuracy_reward/mean": 0.5401785969734192, "rewards/accuracy_reward/std": 0.49894022941589355, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9559151530265808, "rewards/tag_count_reward/std": 0.17015470564365387, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1055.046875, "completions/mean_terminated_length": 822.5372314453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9887592562996111, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11947982399467978, "kl": 0.0230712890625, "learning_rate": 1.0036305231866438e-07, "loss": 0.0543, "num_tokens": 2515390485.0, "reward": 2.4603796005249023, "reward_std": 0.3660542368888855, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.49958035349845886, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.11440251022577286, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 998.04248046875, "completions/mean_terminated_length": 769.790771484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.988972350966917, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1303225026000214, "kl": 0.0291748046875, "learning_rate": 1.0034973216589234e-07, "loss": 0.077, "num_tokens": 2515906584.0, "reward": 2.2857143878936768, "reward_std": 0.4210096001625061, "rewards/accuracy_reward/mean": 0.4129464328289032, "rewards/accuracy_reward/std": 0.49291375279426575, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9575892686843872, "rewards/tag_count_reward/std": 0.16685132682323456, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1059.341552734375, "completions/mean_terminated_length": 803.8455200195312, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.989185445634223, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1254670143802508, "kl": 0.027740478515625, "learning_rate": 1.0033666086012573e-07, "loss": 0.0727, "num_tokens": 2516448321.0, "reward": 2.411830425262451, "reward_std": 0.4222189784049988, "rewards/accuracy_reward/mean": 0.5200892686843872, "rewards/accuracy_reward/std": 0.5001547932624817, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.12648425996303558, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1033.19873046875, "completions/mean_terminated_length": 781.6183471679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.989398540301529, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14364224331612194, "kl": 0.0291748046875, "learning_rate": 1.0032383840859857e-07, "loss": 0.0845, "num_tokens": 2516982058.0, "reward": 2.428013563156128, "reward_std": 0.5446627736091614, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9525669813156128, "rewards/tag_count_reward/std": 0.17413607239723206, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1066.47998046875, "completions/mean_terminated_length": 805.8502807617188, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9896116349688349, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1189858741285944, "kl": 0.026763916015625, "learning_rate": 1.003112648184071e-07, "loss": 0.0607, "num_tokens": 2517531297.0, "reward": 2.353794813156128, "reward_std": 0.40211236476898193, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2979368567466736, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.10528324544429779, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1058.84375, "completions/mean_terminated_length": 813.6211547851562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9898247296361409, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11514824755009852, "kl": 0.02667236328125, "learning_rate": 1.0029894009650974e-07, "loss": 0.0529, "num_tokens": 2518082235.0, "reward": 2.3978796005249023, "reward_std": 0.39937183260917664, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9782366156578064, "rewards/tag_count_reward/std": 0.10681798309087753, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 933.7344360351562, "completions/mean_terminated_length": 777.7938842773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9900378243034468, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12217828664081505, "kl": 0.030426025390625, "learning_rate": 1.002868642497274e-07, "loss": 0.0751, "num_tokens": 2518569012.0, "reward": 2.59375, "reward_std": 0.41784849762916565, "rewards/accuracy_reward/mean": 0.7008928656578064, "rewards/accuracy_reward/std": 0.45837873220443726, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.15090014040470123, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1039.622802734375, "completions/mean_terminated_length": 749.8591918945312, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9902509189707528, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13056884542073685, "kl": 0.025787353515625, "learning_rate": 1.002750372847431e-07, "loss": 0.1193, "num_tokens": 2519103355.0, "reward": 2.4140625, "reward_std": 0.37828412652015686, "rewards/accuracy_reward/mean": 0.4866071343421936, "rewards/accuracy_reward/std": 0.5003793835639954, "rewards/format_reward/mean": 0.9508928656578064, "rewards/format_reward/std": 0.2163332849740982, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11709482222795486, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1040.7991943359375, "completions/mean_terminated_length": 844.7306518554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9904640136380587, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.12949923624707887, "kl": 0.025970458984375, "learning_rate": 1.0026345920810216e-07, "loss": 0.0687, "num_tokens": 2519636305.0, "reward": 2.43359375, "reward_std": 0.5189981460571289, "rewards/accuracy_reward/mean": 0.5424107313156128, "rewards/accuracy_reward/std": 0.49875500798225403, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.9670758843421936, "rewards/tag_count_reward/std": 0.1395251452922821, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1027.8504638671875, "completions/mean_terminated_length": 742.2085571289062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9906771083053647, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1276057211297755, "kl": 0.026641845703125, "learning_rate": 1.0025213002621215e-07, "loss": 0.0685, "num_tokens": 2520170670.0, "reward": 2.392299175262451, "reward_std": 0.4549683928489685, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342794418335, "rewards/format_reward/mean": 0.9151785969734192, "rewards/format_reward/std": 0.2789272665977478, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14515583217144012, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 908.4308471679688, "completions/mean_terminated_length": 738.9564208984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9908902029726706, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13808139649512807, "kl": 0.029754638671875, "learning_rate": 1.0024104974534288e-07, "loss": 0.0539, "num_tokens": 2520644767.0, "reward": 2.4386162757873535, "reward_std": 0.3968634009361267, "rewards/accuracy_reward/mean": 0.5446428656578064, "rewards/accuracy_reward/std": 0.49855971336364746, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.12132563441991806, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1019.857177734375, "completions/mean_terminated_length": 772.0775146484375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9911032976399765, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.13312911762656093, "kl": 0.02691650390625, "learning_rate": 1.0023021837162648e-07, "loss": 0.0777, "num_tokens": 2521178495.0, "reward": 2.3208706378936768, "reward_std": 0.4349842369556427, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.49542489647865295, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.14611591398715973, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1020.9576416015625, "completions/mean_terminated_length": 824.2898559570312, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9913163923072825, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12001805449988731, "kl": 0.025970458984375, "learning_rate": 1.0021963591105725e-07, "loss": 0.1046, "num_tokens": 2521706700.0, "reward": 2.47265625, "reward_std": 0.4653957486152649, "rewards/accuracy_reward/mean": 0.5848214030265808, "rewards/accuracy_reward/std": 0.49330365657806396, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9637276530265808, "rewards/tag_count_reward/std": 0.15578390657901764, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 980.372802734375, "completions/mean_terminated_length": 762.25537109375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9915294869745884, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12421617497083236, "kl": 0.0296630859375, "learning_rate": 1.0020930236949182e-07, "loss": 0.0619, "num_tokens": 2522220323.0, "reward": 2.44140625, "reward_std": 0.34071144461631775, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5002396702766418, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9815848469734192, "rewards/tag_count_reward/std": 0.10210946202278137, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1061.47998046875, "completions/mean_terminated_length": 766.9536743164062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9917425816418944, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1286897519914757, "kl": 0.0269775390625, "learning_rate": 1.0019921775264897e-07, "loss": 0.0737, "num_tokens": 2522767226.0, "reward": 2.458705425262451, "reward_std": 0.468057245016098, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.1502326875925064, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 1034.747802734375, "completions/mean_terminated_length": 797.48486328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9919556763092003, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13345709151656257, "kl": 0.0274658203125, "learning_rate": 1.0018938206610979e-07, "loss": 0.0703, "num_tokens": 2523301769.0, "reward": 2.35546875, "reward_std": 0.4986642599105835, "rewards/accuracy_reward/mean": 0.5022321343421936, "rewards/accuracy_reward/std": 0.5005539655685425, "rewards/format_reward/mean": 0.8883928656578064, "rewards/format_reward/std": 0.31523454189300537, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.141964390873909, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 903.05810546875, "completions/mean_terminated_length": 736.1483154296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9921687709765064, "frac_reward_zero_std": 0.25, "grad_norm": 0.12642127426653382, "kl": 0.030731201171875, "learning_rate": 1.0017979531531753e-07, "loss": 0.0672, "num_tokens": 2523773379.0, "reward": 2.5535714626312256, "reward_std": 0.35648635029792786, "rewards/accuracy_reward/mean": 0.6517857313156128, "rewards/accuracy_reward/std": 0.476936936378479, "rewards/format_reward/mean": 0.9263392686843872, "rewards/format_reward/std": 0.2615099549293518, "rewards/tag_count_reward/mean": 0.9754464030265808, "rewards/tag_count_reward/std": 0.12270178645849228, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 965.4464721679688, "completions/mean_terminated_length": 785.0208740234375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.9923818656438123, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1414723323529596, "kl": 0.028839111328125, "learning_rate": 1.0017045750557779e-07, "loss": 0.0787, "num_tokens": 2524279579.0, "reward": 2.459263563156128, "reward_std": 0.4104117751121521, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12605296075344086, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 870.4576416015625, "completions/mean_terminated_length": 705.6615600585938, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.9925949603111183, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13522805651467137, "kl": 0.0321044921875, "learning_rate": 1.001613686420583e-07, "loss": 0.0695, "num_tokens": 2524733256.0, "reward": 2.5831475257873535, "reward_std": 0.40559300780296326, "rewards/accuracy_reward/mean": 0.6785714030265808, "rewards/accuracy_reward/std": 0.4675469994544983, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.2651226818561554, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10854540765285492, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1012.3192138671875, "completions/mean_terminated_length": 744.6713256835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9928080549784242, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.15724317004364333, "kl": 0.03277587890625, "learning_rate": 1.0015252872978905e-07, "loss": 0.0637, "num_tokens": 2525257095.0, "reward": 2.4581475257873535, "reward_std": 0.3580390512943268, "rewards/accuracy_reward/mean": 0.5379464030265808, "rewards/accuracy_reward/std": 0.49911534786224365, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9760044813156128, "rewards/tag_count_reward/std": 0.12109260261058807, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 995.3683471679688, "completions/mean_terminated_length": 773.462158203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9930211496457301, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.15067789642282367, "kl": 0.032745361328125, "learning_rate": 1.0014393777366226e-07, "loss": 0.0927, "num_tokens": 2525775932.0, "reward": 2.416294813156128, "reward_std": 0.3938027620315552, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21160738170146942, "rewards/tag_count_reward/mean": 0.9743303656578064, "rewards/tag_count_reward/std": 0.1302192062139511, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1044.2701416015625, "completions/mean_terminated_length": 842.44775390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9932342443130361, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12689424555589238, "kl": 0.029296875, "learning_rate": 1.0013559577843237e-07, "loss": 0.0803, "num_tokens": 2526316437.0, "reward": 2.3677456378936768, "reward_std": 0.5028325319290161, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.8995535969734192, "rewards/format_reward/std": 0.30093035101890564, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15587201714515686, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 955.4420166015625, "completions/mean_terminated_length": 766.6754150390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.993447338980342, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.13034694485164006, "kl": 0.032440185546875, "learning_rate": 1.0012750274871603e-07, "loss": 0.0551, "num_tokens": 2526816875.0, "reward": 2.4949777126312256, "reward_std": 0.521579384803772, "rewards/accuracy_reward/mean": 0.6205357313156128, "rewards/accuracy_reward/std": 0.48579615354537964, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.9659598469734192, "rewards/tag_count_reward/std": 0.1508246213197708, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 993.1116333007812, "completions/mean_terminated_length": 804.3421630859375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.993660433647648, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.13244162705370577, "kl": 0.02734375, "learning_rate": 1.0011965868899214e-07, "loss": 0.0852, "num_tokens": 2527332877.0, "reward": 2.3800225257873535, "reward_std": 0.41356196999549866, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9693080186843872, "rewards/tag_count_reward/std": 0.1339094191789627, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 979.8348388671875, "completions/mean_terminated_length": 795.28271484375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9938735283149539, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1314969423834263, "kl": 0.029632568359375, "learning_rate": 1.0011206360360177e-07, "loss": 0.0186, "num_tokens": 2527840739.0, "reward": 2.4693081378936768, "reward_std": 0.4105337858200073, "rewards/accuracy_reward/mean": 0.6203703880310059, "rewards/accuracy_reward/std": 0.48585736751556396, "rewards/format_reward/mean": 0.8973214030265808, "rewards/format_reward/std": 0.30387789011001587, "rewards/tag_count_reward/mean": 0.9737723469734192, "rewards/tag_count_reward/std": 0.11222106218338013, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 983.0670166015625, "completions/mean_terminated_length": 834.030517578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9940866229822599, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3185893363923411, "kl": 0.02874755859375, "learning_rate": 1.0010471749674815e-07, "loss": 0.0655, "num_tokens": 2528356449.0, "reward": 2.3158483505249023, "reward_std": 0.43175843358039856, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11589459329843521, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 868.2678833007812, "completions/mean_terminated_length": 723.3884887695312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9942997176495658, "frac_reward_zero_std": 0.1785714328289032, "grad_norm": 0.1288605359348323, "kl": 0.029541015625, "learning_rate": 1.0009762037249691e-07, "loss": 0.0632, "num_tokens": 2528814409.0, "reward": 2.552455425262451, "reward_std": 0.3744348883628845, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48843589425086975, "rewards/format_reward/mean": 0.9620535969734192, "rewards/format_reward/std": 0.191280335187912, "rewards/tag_count_reward/mean": 0.9810267686843872, "rewards/tag_count_reward/std": 0.11671038717031479, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 814.7210083007812, "completions/mean_terminated_length": 652.7752685546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9945128123168717, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1412322653176845, "kl": 0.0333251953125, "learning_rate": 1.0009077223477566e-07, "loss": 0.0584, "num_tokens": 2529248620.0, "reward": 2.5435268878936768, "reward_std": 0.33847787976264954, "rewards/accuracy_reward/mean": 0.6227678656578064, "rewards/accuracy_reward/std": 0.48523563146591187, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11096391081809998, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1060.571533203125, "completions/mean_terminated_length": 825.9889526367188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9947259069841777, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12865179422458642, "kl": 0.024078369140625, "learning_rate": 1.0008417308737438e-07, "loss": 0.0861, "num_tokens": 2529800700.0, "reward": 2.3035714626312256, "reward_std": 0.41533833742141724, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9642857313156128, "rewards/tag_count_reward/std": 0.1624998301267624, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 915.30810546875, "completions/mean_terminated_length": 756.788818359375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9949390016514836, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.12364860070357474, "kl": 0.026885986328125, "learning_rate": 1.0007782293394515e-07, "loss": 0.0372, "num_tokens": 2530278086.0, "reward": 2.5027902126312256, "reward_std": 0.38372913002967834, "rewards/accuracy_reward/mean": 0.5669642686843872, "rewards/accuracy_reward/std": 0.4960494041442871, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.11601705849170685, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 951.0313110351562, "completions/mean_terminated_length": 768.203125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9951520963187896, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11027513433627976, "kl": 0.026397705078125, "learning_rate": 1.0007172177800232e-07, "loss": 0.0746, "num_tokens": 2530782452.0, "reward": 2.5, "reward_std": 0.37320226430892944, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2067493349313736, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.1042405441403389, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1149.7857666015625, "completions/mean_terminated_length": 888.3458251953125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9953651909860955, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.11379161884338838, "kl": 0.021942138671875, "learning_rate": 1.0006586962292245e-07, "loss": 0.0925, "num_tokens": 2531373844.0, "reward": 2.3828125, "reward_std": 0.5054367184638977, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9241071343421936, "rewards/format_reward/std": 0.265122652053833, "rewards/tag_count_reward/mean": 0.9698660969734192, "rewards/tag_count_reward/std": 0.14262787997722626, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1013.1406860351562, "completions/mean_terminated_length": 850.0232543945312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9955782856534016, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.11989765254294334, "kl": 0.0294189453125, "learning_rate": 1.0006026647194422e-07, "loss": 0.0761, "num_tokens": 2531894163.0, "reward": 2.5128350257873535, "reward_std": 0.4939660429954529, "rewards/accuracy_reward/mean": 0.6473214030265808, "rewards/accuracy_reward/std": 0.4783378839492798, "rewards/format_reward/mean": 0.9084821343421936, "rewards/format_reward/std": 0.2886664867401123, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.161164328455925, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1015.5223388671875, "completions/mean_terminated_length": 780.73974609375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9957913803207075, "frac_reward_zero_std": 0.0, "grad_norm": 0.1483818037775009, "kl": 0.028076171875, "learning_rate": 1.000549123281685e-07, "loss": 0.1088, "num_tokens": 2532421309.0, "reward": 2.4776787757873535, "reward_std": 0.3812069296836853, "rewards/accuracy_reward/mean": 0.5602678656578064, "rewards/accuracy_reward/std": 0.49690937995910645, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9866071343421936, "rewards/tag_count_reward/std": 0.08420762419700623, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 932.466552734375, "completions/mean_terminated_length": 782.787353515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9960044749880135, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.1409235280739369, "kl": 0.03118896484375, "learning_rate": 1.0004980719455852e-07, "loss": 0.0547, "num_tokens": 2532905598.0, "reward": 2.4720983505249023, "reward_std": 0.39919596910476685, "rewards/accuracy_reward/mean": 0.5691964030265808, "rewards/accuracy_reward/std": 0.4957422614097595, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9720982313156128, "rewards/tag_count_reward/std": 0.1219823881983757, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 954.8795166015625, "completions/mean_terminated_length": 769.3629150390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9962175696553194, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.12562608925846927, "kl": 0.0291748046875, "learning_rate": 1.0004495107393944e-07, "loss": 0.0541, "num_tokens": 2533406760.0, "reward": 2.486049175262451, "reward_std": 0.35308435559272766, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.1057254895567894, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 820.482177734375, "completions/mean_terminated_length": 722.872314453125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9964306643226253, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.14587879980622798, "kl": 0.0325927734375, "learning_rate": 1.0004034396899885e-07, "loss": 0.0916, "num_tokens": 2533846608.0, "reward": 2.564732313156128, "reward_std": 0.40221813321113586, "rewards/accuracy_reward/mean": 0.6540178656578064, "rewards/accuracy_reward/std": 0.47621920704841614, "rewards/format_reward/mean": 0.9330357313156128, "rewards/format_reward/std": 0.2502395808696747, "rewards/tag_count_reward/mean": 0.9776785969734192, "rewards/tag_count_reward/std": 0.1136813759803772, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 944.2388916015625, "completions/mean_terminated_length": 725.8475952148438, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9966437589899313, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.13062411935876078, "kl": 0.028045654296875, "learning_rate": 1.0003598588228639e-07, "loss": 0.0788, "num_tokens": 2534335275.0, "reward": 2.47265625, "reward_std": 0.3991635739803314, "rewards/accuracy_reward/mean": 0.5558035969734192, "rewards/accuracy_reward/std": 0.4974316358566284, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24233205616474152, "rewards/tag_count_reward/mean": 0.9793526530265808, "rewards/tag_count_reward/std": 0.10962118953466415, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 960.44873046875, "completions/mean_terminated_length": 759.0502319335938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9968568536572372, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1454670215034997, "kl": 0.029815673828125, "learning_rate": 1.0003187681621395e-07, "loss": 0.091, "num_tokens": 2534839492.0, "reward": 2.4659600257873535, "reward_std": 0.4732803404331207, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9174107313156128, "rewards/format_reward/std": 0.2755681276321411, "rewards/tag_count_reward/mean": 0.9704241156578064, "rewards/tag_count_reward/std": 0.13927440345287323, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1062.759033203125, "completions/mean_terminated_length": 804.6535034179688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9970699483245432, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12514435583075872, "kl": 0.026092529296875, "learning_rate": 1.0002801677305553e-07, "loss": 0.0614, "num_tokens": 2535379688.0, "reward": 2.4051339626312256, "reward_std": 0.44050559401512146, "rewards/accuracy_reward/mean": 0.4955357015132904, "rewards/accuracy_reward/std": 0.5005390048027039, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9654017686843872, "rewards/tag_count_reward/std": 0.15299931168556213, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 999.6652221679688, "completions/mean_terminated_length": 824.9427490234375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9972830429918491, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1267200484425361, "kl": 0.02752685546875, "learning_rate": 1.000244057549474e-07, "loss": 0.0974, "num_tokens": 2535905234.0, "reward": 2.4213171005249023, "reward_std": 0.45943737030029297, "rewards/accuracy_reward/mean": 0.5223214030265808, "rewards/accuracy_reward/std": 0.5000599026679993, "rewards/format_reward/mean": 0.9308035969734192, "rewards/format_reward/std": 0.25407159328460693, "rewards/tag_count_reward/mean": 0.9681919813156128, "rewards/tag_count_reward/std": 0.14469929039478302, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 955.93310546875, "completions/mean_terminated_length": 760.5105590820312, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.9974961376591551, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.14216226081341846, "kl": 0.0277099609375, "learning_rate": 1.0002104376388805e-07, "loss": 0.1024, "num_tokens": 2536407332.0, "reward": 2.4112725257873535, "reward_std": 0.44071486592292786, "rewards/accuracy_reward/mean": 0.5370370149612427, "rewards/accuracy_reward/std": 0.49920448660850525, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13440261781215668, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 780.4620971679688, "completions/mean_terminated_length": 682.9591674804688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.997709232326461, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.1403292276309954, "kl": 0.0352783203125, "learning_rate": 1.0001793080173799e-07, "loss": 0.0491, "num_tokens": 2536827427.0, "reward": 2.625, "reward_std": 0.39604291319847107, "rewards/accuracy_reward/mean": 0.7366071343421936, "rewards/accuracy_reward/std": 0.44096609950065613, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26866820454597473, "rewards/tag_count_reward/mean": 0.9665178656578064, "rewards/tag_count_reward/std": 0.13058780133724213, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1039.7835693359375, "completions/mean_terminated_length": 813.8988647460938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9979223269937669, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12561680779151446, "kl": 0.025604248046875, "learning_rate": 1.0001506687022001e-07, "loss": 0.0466, "num_tokens": 2537368466.0, "reward": 2.361607313156128, "reward_std": 0.42108580470085144, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911531805992126, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.12733516097068787, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 929.013427734375, "completions/mean_terminated_length": 728.7737426757812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9981354216610729, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.1337003579641795, "kl": 0.0289306640625, "learning_rate": 1.0001245197091917e-07, "loss": 0.0434, "num_tokens": 2537850248.0, "reward": 2.4871652126312256, "reward_std": 0.3668181300163269, "rewards/accuracy_reward/mean": 0.5491071343421936, "rewards/accuracy_reward/std": 0.4981389045715332, "rewards/format_reward/mean": 0.9575892686843872, "rewards/format_reward/std": 0.20174959301948547, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10854540765285492, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1093.649658203125, "completions/mean_terminated_length": 833.3721923828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9983485163283788, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.11087445682988281, "kl": 0.023651123046875, "learning_rate": 1.0001008610528253e-07, "loss": 0.0371, "num_tokens": 2538411051.0, "reward": 2.3443081378936768, "reward_std": 0.4391659200191498, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.2721492052078247, "rewards/tag_count_reward/mean": 0.9715401530265808, "rewards/tag_count_reward/std": 0.13646738231182098, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 991.7053833007812, "completions/mean_terminated_length": 792.7745361328125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9985616109956849, "frac_reward_zero_std": 0.1071428656578064, "grad_norm": 0.12353653238258114, "kl": 0.026458740234375, "learning_rate": 1.0000796927461941e-07, "loss": 0.0295, "num_tokens": 2538923063.0, "reward": 2.5145089626312256, "reward_std": 0.4036351144313812, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49168136715888977, "rewards/format_reward/mean": 0.9419642686843872, "rewards/format_reward/std": 0.23407234251499176, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.10627461224794388, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 993.0022583007812, "completions/mean_terminated_length": 724.0812377929688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9987747056629908, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.12458068262714309, "kl": 0.03131103515625, "learning_rate": 1.0000610148010136e-07, "loss": 0.0352, "num_tokens": 2539431800.0, "reward": 2.4190850257873535, "reward_std": 0.37573495507240295, "rewards/accuracy_reward/mean": 0.5111607313156128, "rewards/accuracy_reward/std": 0.5004342198371887, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.2463276982307434, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.12037914246320724, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 890.85498046875, "completions/mean_terminated_length": 738.9065551757812, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9989878003302968, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.11769463533609018, "kl": 0.02886962890625, "learning_rate": 1.0000448272276205e-07, "loss": 0.0468, "num_tokens": 2539904663.0, "reward": 2.5044643878936768, "reward_std": 0.35532113909721375, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4944108724594116, "rewards/format_reward/mean": 0.9441964030265808, "rewards/format_reward/std": 0.22979861497879028, "rewards/tag_count_reward/mean": 0.9821428656578064, "rewards/tag_count_reward/std": 0.10818972438573837, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 946.6317138671875, "completions/mean_terminated_length": 769.7279663085938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.9992008949976027, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.11571966958610877, "kl": 0.026031494140625, "learning_rate": 1.0000311300349733e-07, "loss": 0.0535, "num_tokens": 2540404066.0, "reward": 2.4933037757873535, "reward_std": 0.350578248500824, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49663296341896057, "rewards/format_reward/mean": 0.9397321343421936, "rewards/format_reward/std": 0.23824848234653473, "rewards/tag_count_reward/mean": 0.9910714030265808, "rewards/tag_count_reward/std": 0.07038223743438721, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 806.700927734375, "completions/mean_terminated_length": 685.0049438476562, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9994139896649087, "frac_reward_zero_std": 0.0, "grad_norm": 0.14513044595683983, "kl": 0.030975341796875, "learning_rate": 1.0000199232306522e-07, "loss": 0.1109, "num_tokens": 2540838044.0, "reward": 2.4854912757873535, "reward_std": 0.4160224497318268, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.49542486667633057, "rewards/format_reward/mean": 0.9352678656578064, "rewards/format_reward/std": 0.24632768332958221, "rewards/tag_count_reward/mean": 0.9787946343421936, "rewards/tag_count_reward/std": 0.11389531940221786, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1119.8013916015625, "completions/mean_terminated_length": 866.65625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9996270843322146, "frac_reward_zero_std": 0.0357142873108387, "grad_norm": 0.1119688690457679, "kl": 0.0234375, "learning_rate": 1.0000112068208598e-07, "loss": 0.0653, "num_tokens": 2541412995.0, "reward": 2.3487725257873535, "reward_std": 0.4741098880767822, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.49855977296829224, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.2578272819519043, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.15056781470775604, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 967.9464721679688, "completions/mean_terminated_length": 791.2103881835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9998401789995205, "frac_reward_zero_std": 0.25, "grad_norm": 0.1208217601468528, "kl": 0.024871826171875, "learning_rate": 1.0000049808104197e-07, "loss": 0.082, "num_tokens": 2541918203.0, "reward": 2.5044643878936768, "reward_std": 0.33556851744651794, "rewards/accuracy_reward/mean": 0.5736607313156128, "rewards/accuracy_reward/std": 0.49509719014167786, "rewards/format_reward/mean": 0.9598214030265808, "rewards/format_reward/std": 0.1965973675251007, "rewards/tag_count_reward/mean": 0.9709821343421936, "rewards/tag_count_reward/std": 0.14088858664035797, "step": 4692 }, { "epoch": 0.9998401789995205, "step": 4692, "total_flos": 0.0, "train_loss": 0.08478201504697423, "train_runtime": 144692.0244, "train_samples_per_second": 0.908, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 4693, "num_input_tokens_seen": 2541918203, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }